{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:46:35.302962Z",
     "start_time": "2020-05-14T00:46:34.133795Z"
    }
   },
   "outputs": [],
   "source": [
    "import string, re, os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sn\n",
    "import jieba\n",
    "import operator\n",
    "import zhconv\n",
    "from collections import Counter\n",
    "import warnings\n",
    "warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')\n",
    "\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "from tensorflow.keras.preprocessing import sequence\n",
    "from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation\n",
    "from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D\n",
    "from tensorflow.keras.models import Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:46:37.220009Z",
     "start_time": "2020-05-14T00:46:37.214597Z"
    }
   },
   "outputs": [],
   "source": [
    "# Relative path to the word2vec model file\n",
    "model_path = r'../models/zhwiki.50d.word2vec'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:46:45.811420Z",
     "start_time": "2020-05-14T00:46:45.805922Z"
    }
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei']  # so that Chinese labels render correctly\n",
    "plt.rcParams['axes.unicode_minus'] = False  # so that the minus sign renders correctly"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 影评数据\n",
    "豆瓣的电影短评论"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:47:03.015065Z",
     "start_time": "2020-05-14T00:47:03.010507Z"
    }
   },
   "outputs": [],
   "source": [
    "# Dataset directory and the names of the two comment CSV files read below\n",
    "data_path = \"../datasets/\"\n",
    "comments = ['douban_movie_comments.csv', 'douban_movie_short_comments.csv']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:47:07.825112Z",
     "start_time": "2020-05-14T00:47:07.415059Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 261497 entries, 0 to 261496\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count   Dtype \n",
      "---  ------   --------------   ----- \n",
      " 0   id       261497 non-null  object\n",
      " 1   link     261497 non-null  object\n",
      " 2   name     261497 non-null  object\n",
      " 3   comment  261495 non-null  object\n",
      " 4   star     261497 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 10.0+ MB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>link</th>\n",
       "      <th>name</th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京意淫到了脑残的地步，看了恶心想吐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>https://movie.douban.com/subject/26363254/</td>\n",
       "      <td>战狼2</td>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  id                                        link name  \\\n",
       "0  1  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "1  2  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "2  3  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "3  4  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "4  5  https://movie.douban.com/subject/26363254/  战狼2   \n",
       "\n",
       "                                             comment star  \n",
       "0                                 吴京意淫到了脑残的地步，看了恶心想吐    1  \n",
       "1  首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...    2  \n",
       "2  吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...    2  \n",
       "3                      凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。    4  \n",
       "4                                               中二得很    1  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the first comment file.\n",
    "df1 = pd.read_csv(os.path.join(data_path, comments[0]), low_memory=False)\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is not wrapped in print().\n",
    "df1.info()\n",
    "# Target columns `comment` and `star`: their non-null counts differ and both are dtype object.\n",
    "df1.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:48:17.110181Z",
     "start_time": "2020-05-14T00:48:13.160530Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2125056 entries, 0 to 2125055\n",
      "Data columns (total 10 columns):\n",
      " #   Column         Dtype \n",
      "---  ------         ----- \n",
      " 0   ID             int64 \n",
      " 1   Movie_Name_EN  object\n",
      " 2   Movie_Name_CN  object\n",
      " 3   Crawl_Date     object\n",
      " 4   Number         int64 \n",
      " 5   Username       object\n",
      " 6   Date           object\n",
      " 7   Star           int64 \n",
      " 8   Comment        object\n",
      " 9   Like           int64 \n",
      "dtypes: int64(4), object(6)\n",
      "memory usage: 162.1+ MB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Movie_Name_EN</th>\n",
       "      <th>Movie_Name_CN</th>\n",
       "      <th>Crawl_Date</th>\n",
       "      <th>Number</th>\n",
       "      <th>Username</th>\n",
       "      <th>Date</th>\n",
       "      <th>Star</th>\n",
       "      <th>Comment</th>\n",
       "      <th>Like</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>1</td>\n",
       "      <td>然潘</td>\n",
       "      <td>2015-05-13</td>\n",
       "      <td>3</td>\n",
       "      <td>连奥创都知道整容要去韩国。</td>\n",
       "      <td>2404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>2</td>\n",
       "      <td>更深的白色</td>\n",
       "      <td>2015-04-24</td>\n",
       "      <td>2</td>\n",
       "      <td>非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...</td>\n",
       "      <td>1231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>3</td>\n",
       "      <td>有意识的贱民</td>\n",
       "      <td>2015-04-26</td>\n",
       "      <td>2</td>\n",
       "      <td>2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...</td>\n",
       "      <td>1052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>4</td>\n",
       "      <td>不老的李大爷耶</td>\n",
       "      <td>2015-04-23</td>\n",
       "      <td>4</td>\n",
       "      <td>《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...</td>\n",
       "      <td>1045</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>5</td>\n",
       "      <td>ZephyrO</td>\n",
       "      <td>2015-04-22</td>\n",
       "      <td>2</td>\n",
       "      <td>虽然从头打到尾，但是真的很无聊啊。</td>\n",
       "      <td>723</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID           Movie_Name_EN Movie_Name_CN  Crawl_Date  Number Username  \\\n",
       "0   0  Avengers Age of Ultron        复仇者联盟2  2017-01-22       1       然潘   \n",
       "1   1  Avengers Age of Ultron        复仇者联盟2  2017-01-22       2    更深的白色   \n",
       "2   2  Avengers Age of Ultron        复仇者联盟2  2017-01-22       3   有意识的贱民   \n",
       "3   3  Avengers Age of Ultron        复仇者联盟2  2017-01-22       4  不老的李大爷耶   \n",
       "4   4  Avengers Age of Ultron        复仇者联盟2  2017-01-22       5  ZephyrO   \n",
       "\n",
       "         Date  Star                                            Comment  Like  \n",
       "0  2015-05-13     3                                      连奥创都知道整容要去韩国。  2404  \n",
       "1  2015-04-24     2   非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...  1231  \n",
       "2  2015-04-26     2   2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...  1052  \n",
       "3  2015-04-23     4   《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...  1045  \n",
       "4  2015-04-22     2                                  虽然从头打到尾，但是真的很无聊啊。   723  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the second (short comments) file.\n",
    "df2 = pd.read_csv(os.path.join(data_path, comments[1]), low_memory=False)\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is not wrapped in print().\n",
    "df2.info()\n",
    "# Target columns here are `Comment` and `Star`.\n",
    "df2.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:50:19.719739Z",
     "start_time": "2020-05-14T00:50:19.631230Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Movie_Name_EN</th>\n",
       "      <th>Movie_Name_CN</th>\n",
       "      <th>Crawl_Date</th>\n",
       "      <th>Number</th>\n",
       "      <th>Username</th>\n",
       "      <th>Date</th>\n",
       "      <th>star</th>\n",
       "      <th>comment</th>\n",
       "      <th>Like</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>1</td>\n",
       "      <td>然潘</td>\n",
       "      <td>2015-05-13</td>\n",
       "      <td>3</td>\n",
       "      <td>连奥创都知道整容要去韩国。</td>\n",
       "      <td>2404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Avengers Age of Ultron</td>\n",
       "      <td>复仇者联盟2</td>\n",
       "      <td>2017-01-22</td>\n",
       "      <td>2</td>\n",
       "      <td>更深的白色</td>\n",
       "      <td>2015-04-24</td>\n",
       "      <td>2</td>\n",
       "      <td>非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...</td>\n",
       "      <td>1231</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID           Movie_Name_EN Movie_Name_CN  Crawl_Date  Number Username  \\\n",
       "0   0  Avengers Age of Ultron        复仇者联盟2  2017-01-22       1       然潘   \n",
       "1   1  Avengers Age of Ultron        复仇者联盟2  2017-01-22       2    更深的白色   \n",
       "\n",
       "         Date  star                                            comment  Like  \n",
       "0  2015-05-13     3                                      连奥创都知道整容要去韩国。  2404  \n",
       "1  2015-04-24     2   非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...  1231  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Give the target columns the same names as in df1 (`comment`, `star`).\n",
    "df2 = df2.rename(columns={\"Comment\": \"comment\", \"Star\": \"star\"})\n",
    "df2.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:50:23.511614Z",
     "start_time": "2020-05-14T00:50:23.317616Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2386553 entries, 0 to 2386552\n",
      "Data columns (total 2 columns):\n",
      " #   Column   Dtype \n",
      "---  ------   ----- \n",
      " 0   comment  object\n",
      " 1   star     object\n",
      "dtypes: object(2)\n",
      "memory usage: 36.4+ MB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>吴京意淫到了脑残的地步，看了恶心想吐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>中二得很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>“犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>脑子是个好东西，希望编剧们都能有。</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             comment star\n",
       "0                                 吴京意淫到了脑残的地步，看了恶心想吐    1\n",
       "1  首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...    2\n",
       "2  吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...    2\n",
       "3                      凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。    4\n",
       "4                                               中二得很    1\n",
       "5                        “犯我中华者，虽远必诛”，吴京比这句话还要意淫一百倍。    1\n",
       "6                                  脑子是个好东西，希望编剧们都能有。    2\n",
       "7  三星半，实打实的7分。第一集在爱国主旋律内部做着各种置换与较劲，但第二集才真正显露吴京的野心...    4\n",
       "8  开篇长镜头惊险大气引人入胜 结合了水平不俗的快剪下实打实的真刀真枪 让人不禁热血沸腾 特别弹...    4\n",
       "9  15/100吴京的冷峰在这部里即像成龙，又像杰森斯坦森，但体制外的同类型电影，主角总是代表个...    1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Extract the target columns (comment text and star rating) from both sources.\n",
    "# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.\n",
    "dataset = pd.concat([df1[['comment', 'star']], df2[['comment', 'star']]],\n",
    "                    ignore_index=True)\n",
    "dataset.info()\n",
    "dataset.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:51:03.670623Z",
     "start_time": "2020-05-14T00:51:03.050307Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 2386550 entries, 0 to 2386552\n",
      "Data columns (total 2 columns):\n",
      " #   Column   Dtype \n",
      "---  ------   ----- \n",
      " 0   comment  object\n",
      " 1   star     int64 \n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 54.6+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Drop rows that contain a missing value.\n",
    "dataset = dataset.dropna()\n",
    "\n",
    "# Keep only rows whose rating is 1-5, stored either as a string or as an integer.\n",
    "valid_stars = list('12345') + list(range(1, 6))\n",
    "# .copy() makes the filtered frame an independent object before columns are assigned on it.\n",
    "dataset = dataset[dataset['star'].isin(valid_stars)].copy()\n",
    "\n",
    "# Normalise dtypes: integer ratings, string comments.\n",
    "dataset['star'] = dataset['star'].astype('int')\n",
    "dataset['comment'] = dataset['comment'].astype('str')\n",
    "\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is not wrapped in print().\n",
    "dataset.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据类别不均匀\n",
    "> **如何处理这种不平衡?**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:44:43.592060Z",
     "start_time": "2020-05-14T01:44:43.326260Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de3hU5bn38e8NEYUACYdIVMCAsovIQW1koyInxYqCRcWqrwUtWgISPBWqVttaYSPdu1oOVikWoaIgKAehCFXfAqYalECpRX1R2IABBJICCYcgkNzvHzOEhCRkwEkGXL/Pdc3lWvc6zPMgrN+sZ601Y+6OiIgEV41YN0BERGJLQSAiEnAKAhGRgFMQiIgEnIJARCTgFAQi1cTMrJxaXCzaIlKSgkC+M8yssZnVCE+nmNl/hKfjjj3gWsiZFeyndfi/Lc2sRwXrpJvZWWb2NzNrZ2YjzKy+mf3RzLpU0MROZvbuMbVlZnZZJf2qtD3HrF/DzJaZ2fmVrSsCCgI5jZjZ783sUzNbGn6tNrOXwssaAEuBO8OrtwQWm1lz4F4gy8xWmtkuM1sLrARWmlm9Y97jRuDN8Kd3ByaZWZ1ymhMHPAkcBuKBO9w9H+gBbK6gC+2AD0q8VyLQGPjHcfocaXtK6gXUcfdNlawnAoT+MoucLooIfXg58ve2JnDQzC4EXgAedPf/C+DufzOzXwKXAy8BM919t5nNA8YCy4B27r7nyM7DZxO/Bp7w0JOWG8zs7fD6g0qsdxawldCBuQHQDXg/fCZQ193/N7zeme7+TXh6QbgtBWbWFxgF1AcSgS/Do0bJ7l73JNpzBTAD2EsomP4D+NrMVpf4s7Pwn9tQd18a+R+5BIHpyWI5XZhZZ6AVkBourQCygVrAKmA38EvgN8Dh8MGT8IF3gLvfUiIIDgGvA//h7gXh9X4O9HT3niXesy6wHPgrMNzd3czqA6OBjoQO7tOAnUATQmcE2UBzYA9wibvvMbPPgCvDYTQKWA/cAwxy97Xh91rv7heUeO+I2nPMn9GdwEB372lmfwIecPf9J/HHLQGioSE5LZjZ/cBzwN3ANUBPQgfS/wIudfftwO+B7YSC4P+Z2aHwgfMtoP2RsfawocDoEiFwDfAQJT5pA7j7XkIH92sJfer/HlAA/C+wBcggFEC7gAuA5909FfgboeGiI2ccRcd0qWto96EQOHadE2zPkW1SgJEltulO6KxF5Lg0NCSnBXd/wcw+B75/zKI17r7YzIYBtdx9Qrj+CzPbCHxD6APP3UBOie3+Aswxs5pAHWASobOJTDOrTWgoJT+8bkNCB+XLgUKgKZAL/Bx4mtDwTV/gjBLta07oU39FPgb+ZGaT3f3ecK0IIHzd4kTag5mdAywEzgfmhoeamgLLzczD7env7guP0yYJKA0NyWnDzJ4FzgSywqXWQD1CwzLtgX6EDpI7wkM4G4ELgbeBJEIHzQuAHYSGbWoBb7j7SDOr5e4Hw+/z38AWdx8Xnl9M6Ozh/fD894E/ERrfr0FoKOgQcAOhC9bXA++7e7sSbV9D6NN5IZAMPObuU81sOfBrd/+rma119++F1z+R9nQA5gDjgJ+7e9NwfR3Q1t0PmNlU4DV3P/auJRENDclppZDQcMcgYAihu2MMeB64ldCB/V1CB+IjGgGj3P3S8JDNMiDN3VPdvb27jwQ4ctAN6wp8VGL+PErcCeTuK4HOQB6hC7P/DawNDzPNAeYRGho61tXufgmhEDniWWBA+AL03hLvEXF7wu34hbuPL+c9S9KnPimXgkBOJ88A/QkdeD8HRgCPhK8P1CY0NLLY3ReF148DFgFnR/oGZvZDoKa7Ly9RPofQ9YCSziV0ZrIY+COh21EhFARdgAURvuVcYCCQwNGhnxNqj7tvdPeZ4dky/6bNrBahQDwcYZskYBQEcloIPyfwJqHhoBqExtPbAj8zs0uAD4GF7v7z
8PrfIzRmn+bub5bcVfhV3nv8kNBtqEPC8/XN7ErgwJHbQEtYR+iuozOBJ4CLzewWYCYwnND9/leXWP8MICN8S+d94XnC/TBCZzHrvkV7jij5kFxc+H3WErpG8EkF20jQubteep3yL+AiQnfKtAC2AfeF63cSOoD2KGebhHJqi4Hry6mPAz4Fvl+i9nNgDaFbT0uuWwdYQugupeRw7cfAe4RuR4XQswXvAvHh+fuAM8PTHUqsdw6hs4ifA/VPpj3HtO3MEtNbCD3ncEas///pdWq/dLFYTjtmVtdDt1Eeebirprvv+5b7rEfok/ahaLTx2zrV2iPfbQoCEZGA0zUCEZGAUxCIiATcafdkcePGjT0lJSXWzRAROa2sXLky192Tylt22gVBSkoKWVlZla8oIqe8L774guTkZOrXrx/rpnznmVmFX0uuoSGRAFuxYgU//OEP+cEPfsCiRaHn8CZOnEhycjKpqals2LAh4n3dfffdTJ06FYDCwkJ++tOfkpSUxG233cY335T/2MP//M//cNZZZ3Hbbbdx9dVXc/jwYfLz81m8ePG37ptETkEgElB5eXncddddPPTQQzz++OPcfffdZGRkMHLkSFatWsWECRNIT0+PaF9/+9vfeOWVV4rnJ06cyMaNG9myZQuXXXYZ48aNK7PN0qVLueqqq9i/fz8HDx7ksssu49NPP2X27NlceeWVUeunVE5BIBJQX3/9NaNGjaJ79+5069aNZs2asXTpUgYMGMC5557LFVdcQW5uLvv2Hf8RjQMHDjBs2DBuueWW4tqcOXMYPnw4tWrV4v777+ett94qs920adPo378/+fn5JCYmkpCQQH5+Pnv37tVQUTU77a4RiEh0tG7dmtatW1NYWMi8efM4dOgQX331FT16HP1Z5HPPPZdNmzbRpk2bCvczcuRIbr31VmrUOPq5cvPmzbRv3x6AhIQEcnJySm3zl7/8heuvv56aNWuSmJjIzp07qVevHqtWrSr1/lI9dEYgEnDjxo3jxz/+MWlpabh7qU/j8fHx7N69u8Jt16xZw8KFC3nyySdL1QsLC0vt5/Dho9935+7MmTOHfv36AVC/fn0aNWrEF198QVFREWPGjKF///7R6p5EQEEgEnCPPPIIX375JaNHj+bf//53qQN/QUFBqU/6Jbk7Q4YMYeLEidSqVavUsgYNGpTaz4EDB4qnZ82aRb9+/Qj/eA4AU6dOZcqUKSQkJLBv3z7y8vLIzy/zZaxSRTQ0JBJQ69atY//+/bRv356mTZty+eWX07dvXzIzM7nrrrtwd1atWsV5551X7vbZ2dn885//pG/fvgDs3buXmjVrkpubS2pqKpmZmfTr148NGzZQp04dIHSm8M477zB58uQy+5s/fz7XXXcdy5Ytw93Jy8vTtYJqoiAQCaitW7eSlpZGZmYmBQUFrFixgvHjx/OLX/yCHj16sGbNGho1akSzZs3K3b558+alPrU/9dRTpKSkcM899/D++++TlpZGUlISY8eOLQ6LV155hQEDBpTZ16FDoe/Wa9iwIXv27KGoqIh69epVQa9PHXv37iU7O5uLLroo1k3R0JBIUHXp0oUBAwZw0UUXcfXVVzN27FiaN2/Om2++yXPPPceyZct47bXXAPjwww+58cYbT2jfw4cP58EHH6RevXr8+te/5uDBg2RmZtK1a9cy6y9ZsoQ+ffrQoEEDCgsLMTMSExOj1tfyjBkzhoSEBBITE3n22Wc5ePAgaWlpNG3alHbt2pGRkVHpPoYNG0Z8fDxNmjRh+vTpldZLev755wEYO3YsLVu2ZP360E9cz5w5s9z1q9Jp9+2jqamprieLRU5MymOx/816LzxM4f7dxNVrfNL72Dgm8jA6nszMTAYPHszcuXM5cOAAnTp1YsSIEaxdu5apU6eyevVq+vTpwxdffFHhmcmMGTOYPHkyr776KmvXrqVPnz7k5OQwZ86ccutnnnn0N4N27drFb37zG8aOHUv37t1JT09n
8+bN9OjRg9zcXLp37x6VfpZkZis99HOtZWhoSESqhdWM+1YhEE116tRh+vTptGzZEgh9dc3o0aN57733iIuLIzU1lcaNG7N27VpSU8s9dtKsWTOmTJlCcnIyycnJmBm7d++usN6kSZPibceOHcuDDz4IhC66N2jQgM8++4wlS5bwwAMPVP0fwDEUBCISOB06dCie3rp1K9nZ2bRt25bc3FwA9uzZw6ZNm2jcuOLg6ty5c/H0xx9/TMOGDWnSpEmpA37J+hHbt2+noKCAFi1aAFCjRg1ycnKoUaNGzC6O6xqBiATaE088QVpaGvfeey/p6en89re/pWfPnrRu3ZpIv+n4scce49FHH42oPnbsWB5++OHi+dtvv52nnnqKQ4cOsXXrVi677LJqv3W20iAwsyFmtjT8Wm1mfzSzyWaWaWZPllgvqjURkaq2aNEi/v73v/PLX/6SwYMHM378eHJycli1alW5B/byvPjiixw8eJBBgwZVWv/qq6+Ii4vjnHPOKa6lpaXx+eef06hRI9577z3uvPNOli1bFp0ORqjSIHD3F929m7t3AzKA9YR+I/YKoKWZtTKzW6JZq6K+iogUy87O5r777uO1114jPj4egJtvvpmuXbty8cUXl/rupIqsXr2ap59+mmnTppV68K6i+tixY3nooYfK7GfJkiV069YNd6dRo0bHfZq7KkQ8NGRm5wFNgKbArHD5HaAz0C3KNRGRKvPNN99w0003MWLECDp27Fhq2TPPPMPo0aNLPflcnn//+9/06dOHF154oXi8/3j1L7/8kkaNGtGoUaMy+/rkk09o164dZhaTB+lO5BrBUOBFIB7YEq7tJBQO0a6VYmaDzCzLzLKO/fIqEZETtXjxYlavXs2YMWOK7+6ZPXs2b7/9NrVq1aJXr17F62ZnZxd/gV5Jr776Klu3bmXIkCHF+/j4448rrI8fP55hw4aV2c+uXbs4//zzAbj++uuZNGkSV199ddV1vhwRPUdgZjWAD4ArgbHADHdfHh7WaU3o4B21mruPrqgteo5A5MSdCs8RRMPJPEdwqvT9cN524hLKfM49Id/mOYrjPUcQ6RnB1cBHHkqNlRwdvukAbKyCmojId8q3DYGqFOlzBD8A3g9PzwMyzOxcoBfQCfAo10REpJpEdEbg7r9w9znh6XxCF3iXA93dPS/ateh1T0REKnNSTxa7+y6O3ulTJTUREakeerJYRCTgFAQiIgGnIBARCTgFgYhIwCkIREQCTkEgIhJwCgIRkYBTEIiIBJyCQEQk4BQEIiIBpyAQEQk4BYGISMApCEREAk5BICIScAoCEZGAUxCIiAScgkBEJOAUBCIiAacgEBEJOAWBiEjARRwEZvaCmfUJT082s0wze7LE8qjWRESkekQUBGZ2NZDs7gvM7BagprtfAbQ0s1bRrlVJT0VEpFyVBoGZnQG8BGw0sx8C3YBZ4cXvAJ2roCYiItUkkjOCAcBnwH8DHYGhwJbwsp1AEyA+yrVSzGyQmWWZWVZOTk6kfRMRkQhEEgSXApPcfRvwKvA+UDu8rG54H3ujXCvF3Se5e6q7pyYlJUXcORERqVwkQbAOaBmeTgVSODp80wHYCKyMck1ERKpJXATrTAZeNrM7gDMIjenPN7NzgV5AJ8CBjCjWRESkmlR6RuDue9z9Nnfv4u5XuPsmQmGwHOju7nnunh/NWrQ7KSIiFYvkjKAMd9/F0Tt9qqQmIiLVQ08Wi4gEnIJAJIaKiopYtWpVrJshAacgkEAbM2YMCQkJJCYm8uyzz1ZYq8yGDRto3rw5eXllL3FNmTKFe+65p9ztpk+fTm5uLrNnz+a8884jMzMTgJkzZ55ch0ROwkldIxD5LsjMzGTGjBn84x//4MCBA3Tq1ImOHTuWqf3oRz+iWbNmx93XAw88wOOPP05CQkKpek5ODiNGjKB3795ltjl8+DBLlixh8uTJ9O3bl2nTpjFr1ixatWpFUVFRVPsqcjwKAgmsOnXqMH36
dFq2DD0mk5KSQlFRUZna1q1bjxsE8+fPZ8OGDQwaNKjMsocffpiePXuWu93LL7/MwIEDAcjPz6d58+bk5+czZ84c+vfv/227JxIxDQ1JYHXo0IGLL74YgK1bt5KdnU1qamqZWtu2bSvcx6FDh3j44Yc588wzufXWW0sN6bz77rts3ry53IA4cOAAK1eu5KqrrgIgMTGRdevWUa9ePQ4dOkTt2rXLbCNSVRQEIsATTzxBWloa8fHxx60d64033mD79u0MHTqUfv36kZ6ezuLFiykoKOCRRx7hpZdewszKbDdx4kQGDx5cPD9w4ECGDh1KgwYNKCgooHXr1mzcuDGqfRSpiIaGJPAWLVrE3//+d1avXn3cWnk++ugjfvrTnxYP8axfv54FCxawdOlS+vfvT6tWrdiyZUupbfbu3cu6det46KGHimu9e/emd+/ejBs3jqVLlzJ8+HAWLFjAsGHDothTkfIpCCTQsrOzue+++5g7d27xJ//yahWpX78+DRs2LJ4/66yzSExMZN68eeTm5vLcc89x8OBBCgoKisf/J0yYQHp6epl9rV+/ngsvvJC33nqLFi1a8OGHH0a3syIV0NCQBNY333zDTTfdxIgRI+jYsWOFtePp0qULb775Jvv27WPnzp28+uqrdO3alU8//ZTt27ezbds25syZw+23386cOXPYtWsXO3bsoHXr1mX2tWjRInr16kW9evXYvHkz9evXj3qfRcqjIJDAWrx4MatXr2bMmDEkJyeTnJzM66+/XqY2e/ZssrOzad++fZl99OzZk759+9K2bVtatWpF7969ue666yp8z7Fjx/Lggw+WqRcWFlK/fn1q1KjBHXfcwa9+9StuuOGGqPZXpCLm7rFuwwlJTU31rKysWDdDTjMpjy2MdRMAOJy3nbiEMr+9dEI2jrnxhLc5Vfr/bQW573By/T/CzFa6e2p5y3RGIFKNvm0IiFQFBYGISMApCEREAk5BICIScAoCEZGAUxCIiAScgkBEJOCOGwRmFmdmX5nZ0vCrnZn9xsxWmNkfSqwX1ZqIiFSfys4I2gMz3L2bu3cDagGdgY7ADjO71sy+H81aFfRRRESOo7IvnesE9Daz7sC/gLXAbHd3M/sr0AvIi3LtvSrop4iIVKCyM4IVwLXu3hE4A6gNHPlO3Z1AEyA+yrUyzGyQmWWZWVZOTk7EnRMRkcpVFgSfuPvX4eksYC+hMACoG94+2rUy3H2Su6e6e2pSUlJkPRMRkYhUFgTTzKyDmdUE+hL6BN85vKwDsBFYGeWaiIhUo8quETwNTAcMmA+MAjLMbBxwffi1CXgmijUREalGxz0jcPc17t7e3du5+xPuXgRcC2QAvdx9Q7RrVdZTEREp1wn/VKW7FwBvVmVNRESqj54sFhEJOAWBiEjAKQhERAJOQSAiEnAKAhGRgFMQiIgEnIJARCTgFAQiIgGnIBARCTgFgYhIwCkIREQCTkEgIhJwCgIRkYBTEIiIBJyCQEQk4BQEIiIBpyAQEQk4BYGISMApCEREAk5BICIScBEFgZk1MbN/hKcnm1mmmT1ZYnlUayIiUn0iPSP4HVDbzG4Barr7FUBLM2sV7Vr0uygiIscTV9kKZtYD2AdsA7oBs8KL3gE6A5dGufblyXZGRERO3HHPCMysFvBL4LFwKR7YEp7eCTSpglp57RhkZllmlpWTkxNp30REJAKVDQ09Brzg7rvD83uB2uHpuuHto10rw90nuXuqu6cmJSVF1jMREYlIZUFwLTDUzJYClwB9CA3fAHQANgIro1wTEZFqdNxrBO7e5ch0OAxuAjLM7FygF9AJ8CjXRESkGkX8HIG7d3P3fEIXjJcD3d09L9q1aHVMREQiU+ldQ8dy910cvdOnSmoiIlJ99GSxiEjAKQhERAJOQSAiEnAKAhGRgFMQiIgEnIJARCTgFAQiIgGnIBARCTgFgYhIwCkIREQCTkEgIhJwCgIRkYBTEIiIBJyCQEQk4BQEIiIBpyAQ
EQk4BYGISMApCEREAi6iIDCzhmbW08waV3WDRESkelUaBGbWAPgL0BFYYmZJZjbZzDLN7MkS60W1JiIi1SOSM4L2wCPu/l/AX4EeQE13vwJoaWatzOyWaNaqoqMiIlK+uMpWcPdlAGbWhdBZQUNgVnjxO0Bn4NIo17482Q6JiMiJifQagQG3A7sAB7aEF+0EmgDxUa4d+/6DzCzLzLJycnIi7ZuIiEQgoiDwkKHAJ8CVQO3worrhfeyNcu3Y95/k7qnunpqUlBRx50REpHKRXCx+1MwGhGcTgTGEhm8AOgAbgZVRromISDWp9BoBMAmYZWb3AWuAecD7ZnYu0AvoRGi4KCOKNRERqSaVnhG4+y537+nuXdz9fnfPA7oBy4Hu7p7n7vnRrEW7kyIiUrFIzgjKcPddHL3Tp0pqIiJSPfQVEyIiAacgEBEJOAWBiEjAKQhERAJOQSAiEnAKAhGRgFMQiIgEnIJARCTgFAQiIgGnIIiRrVu3snXr1lg3Q0REQVDSwYMHSUtLo2nTprRr146MjIzjrv+rX/2Ks88+m2uuuYadO3cW11977TWuvfZabrvtNr78svzf2Pnd735HrVq1+NnPfkbbtm3ZtWsXRUVFvPnmm1Htk4hIZRQEJUyaNIl9+/axceNGpkyZwo9+9CP27NlT7roLFy5k7ty5rFu3jvvuu48nnwz93PIHH3zA73//eyZMmECXLl0YMGBAmW2//PJLGjZsSOPGjcnKyuLee+/l/fffZ9GiRVx66aVV2sdTRVFREatWrYp1M0QEBUEpM2fOZMiQIcTFxZGamkrjxo1Zu3ZtuevOmTOH9PR06tevzx133FF89pCXl8cf//hHLrroIn7yk5/wxRdflNl2/PjxPPDAAwCYGQkJCeTn57Nu3TouuOCCqutgBJ5++mmaNm1K06ZNmTJlSoXrpaenk5ycXPyqXbs2r7zySvHyUaNG8dRTT1W4/fTp08nNzWX27Nmcd955ZGZmAqH/ByJSvRQEx8jNzQVgz549bNq0icaNG5e73ubNm2nfvj0QOpifccYZ7Nu3jxtuuIHvf//7FBQUMHLkSPr161dqu3/961+kpKRQv359IPTJeOfOneTm5tKqVasq7Fnl1qxZw7Jly9i0aROZmZk8+uijbN++vdx1n3/+ebZt28a2bdvIzs4mJSWF6667DoCXXnqJ0aNHV/g+hw8fZsmSJVx33XVMmzaNadOmMWvWLHJzcykqKqqSvolIxRQEJdx1112kp6fz29/+lp49e9K6dWtSUlLKXbewsLD4YA5Qp04d8vKO/pTCgAEDGD9+PGlpaaW2+8Mf/sD9999fPN+1a1def/119u/fz5IlS7jmmmsoLCyMbsci9Pnnn5OamkrNmjVp1qwZ559/Ptu2bat0u2nTpnH99deTnJzMvn37WLx4McOGDatw/ZdffpmBAwcCkJ+fT/PmzcnPz2fOnDn07ds3av0RkcgoCEoYPHgw48ePJycnh1WrVvHoo49WuG6DBg3YvXt38XxBQQE1ahz943zjjTeYP38+/fr1w90B+Oijj+jQoQO1a9cuXm/kyJFkZGRw9tlns2LFCtq0acNnn31WBb2rXJs2bZg/fz5ff/01GRkZ5ObmctFFF1W6Xcmhrvj4eGbPnl2qjyUdOHCAlStXctVVVwGQmJjIunXrqFevHocOHapwOxGpOgqCY9x888107dqViy++mFtuuaXC9VJTU4vHtffv38+GDRto1KgRGRkZ7NixA4CePXuyc+dO9u/fD8Cf/vQn7r333jL7mjt3LjfffDNmViZgqlObNm1o1qwZN954IwMHDmT48OHUqlXruNtkZmZyzjnn0KJFi4jeY+LEiQwePLh4fuDAgQwdOpQGDRpQUFBA69at2bhx47fphoicIAVBOZ555hlGjx6NmVW4Tr9+/Rg3bhzvvPMOP/vZz+jRowdnnHEGS5Ys4eGHH6awsJB3332X8847j/j4eJYsWULnzp3LPbDm5uYW
X4vIz88vNeRUnaZOnUqLFi1YtWoVa9euZd68eaxYseK420ybNo0777wzov3v3buXdevWlbozqnfv3qxfv57ExEQ++OADhg8fzoIFC75VP0TkxJzUT1WezlIeW3jc5QXrV5C3eQ9DlhXBsoUczs9hx5u/4dyBz5dZ95v//Al97h5KzbqNaPSDdFIeW4gfbkfuqveoVb8xcQlJNOr1ICmPLST37bE0un4Yvz7m/Q/mfoUfPMxzjy1kZ348H76xmLm1emAzNh+3nRvH3Hjina/E8uXLadOmDQA1atTgkksuISsri8svv7zc9YuKiliwYAGjRo2KaP8TJkwgPT29TH39+vVceOGFvPXWW7Ro0YIPP/zw5DshIicscEFQmdoXXE7tC44e+OLqJ5UbAgB1LvxP6lz4n6VqFleLpB+WvbaQeNX/wWrULFOv1bh58XTDnkNOttlR0aJFC2bOnEm3bt3YtWsXM2bMYMaMGRWuv3r1apo0aULDhg0r3feuXbvYsWMHrVu3LrNs0aJF3H///UyaNInNmzfH7IxIJKgqDQIzSwBeB2oC+4DbgReBNsBCdx8VXm9yNGvfNXEJZ8e6CZWeDRV9cwH/3lmDSzteidWMo94lvbhzxnp23Dqg3DDMXzGPw55c7n53/z30/MTUA6FluzNeo277a5l7zLpeVMi+z/6X321exD7/HgsfGEGT20fx+0raWhVnRCJBFckZwV3Ac+7+rpm9CNwB1HT3K8zsZTNrBbSLZs3dy/9eBqlSNc6sQ1Lfx8vUKzojqn95xbd6Jna+q9R83fbXEpfQpMx6VqMmddteA0B8m67Et+l6Ik0WkSioNAjc/YUSs0nAj4Gx4fl3gM7ApcCsKNYUBN8x5YWAiJwaIr5ryMyuABoA2cCWcHkn0ASIj3Lt2PceZGZZZpaVk5MTaZNFRCQCEQWBmTUEJgADgb3Akad+6ob3Ee1aKe4+yd1T3T01KSkp0r6JiEgEKg0CM6sFvAE87u6bgJWEhm8AOgAbq6AmIiLVJJKLxfcClwFPmNkTwBSgv5mdC/QCOgEOZESxJiIi1aTSMwJ3f9HdG7h7t/Drz0A3YDnQ3d3z3D0/mrVod1JERCp2Ug+Uufsujt7pUyU1ERGpHvquIRGRgBHAKRIAAAScSURBVFMQiIgEnIJARCTgFAQiIgGnIBARCTgFgYhIwCkIREQCTkEgIhJwCgIRkYBTEIiIBJyCQEQk4BQEIiIBpyAQEQk4BYGISMApCEREAk5BICIScAoCEZGAUxCIiAScgkBEJOAUBCIiARdREJhZEzPLCE+fYWYLzOwDMxtYFTUREak+lQaBmTUA/gzEh0vDgJXufhXQz8zqVUFNRESqSSRnBIXA7UB+eL4bMCs8/T6QWgW1UsxskJllmVlWTk5OBE0WEZFIVRoE7p7v7nklSvHAlvD0TqBJFdSObcMkd09199SkpKTIeiYiIhE5mYvFe4Ha4em64X1EuyYiItXkZA66K4HO4ekOwMYqqImISDWJO4lt/gy8bWZXA22AjwgN7USzJiIi1STiMwJ37xb+7yagJ/ABcK27F0a7Fr3uiYhIZU7mjAB338rRO32qpCYiItVDF2ZFRAJOQSAiEnAKAhGRgFMQiIgEnIJARCTgFAQiIgGnIBARCTgFgYhIwCkIREQCTkEgIhJwCgIRkYBTEIiIBJyCQEQk4BQEIiIBpyAQEQk4BYGISMApCEREAk5BICIScAoCEZGAO2WCwMwmm1mmmT0Z67aIiATJKREEZnYLUNPdrwBamlmrWLdJRCQoTokgALoBs8LT7wCdY9cUEZFgMXePdRsws8nAeHf/p5ldB1zm7mNKLB8EDArPfg9YG4NmnojGQG6sGxEjQe47BLv/Qe47nPr9P9/dk8pbEFfdLanAXqB2eLoux5ypuPskYFJ1N+pkmVmWu6fGuh2xEOS+Q7D7H+S+w+nd/1NlaGglR4eDOgAbY9cUEZFgOVXOCOYBGWZ2LtAL6BTj9oiI
BMYpcUbg7vmELhgvB7q7e15sW/StnTbDWFUgyH2HYPc/yH2H07j/p8TFYhERiZ1T4oxA5LvAzBqaWU8zaxzrtoicCAVBlJlZEzPLiHU7qpuZJZjZIjN7x8zmmlmtWLepOplZA+AvQEdgiZmVe5ved1n47/4/Yt2O6mRmcWb2lZktDb/axbpNJ0NBEEXhg8GfgfhYtyUG7gKec/frgG3A9TFuT3VrDzzi7v8F/BW4LMbtiYXfcfQ28KBoD8xw927h179i3aCToSCIrkLgdiA/1g2pbu7+gru/G55NAnbEsj3Vzd2XuftyM+tC6KwgM9Ztqk5m1gPYR+hDQJB0Anqb2cfh70s7Ve7EPCEKgihy9/zvwB1P34qZXQE0cPflsW5LdTMzI/RBYBdwKMbNqTbhYcBfAo/Fui0xsAK41t07AmcAN8S4PSdFQSBRY2YNgQnAwFi3JRY8ZCjwCXBTrNtTjR4DXnD33bFuSAx84u5fh6ezgNPyCzMVBBIV4U+FbwCPu/umWLenupnZo2Y2IDybCATpoHgtMNTMlgKXmNmfYtye6jTNzDqYWU2gL/DPWDfoZOg5gipgZkvdvVus21GdzGwIMJqj/xBedPeZMWxStQrfKDALOBNYAwz1AP7jCtrffTNrC0wHDJjv7k/EuEknRUEgIhJwGhoSEQk4BYGISMApCEREAk5BICIScAoCEZGAUxCIiAScgkBEJOD+Px7uAAsJP2oKAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Star ratings 1..5 and the number of reviews carrying each rating.\n",
    "x = np.arange(1, 6)\n",
    "nums = [len(dataset[dataset['star'] == i]) for i in x]\n",
    "\n",
    "\n",
    "def plot_score_distribution(x, nums, gap=1000):\n",
    "    \"\"\"Draw a bar chart of review counts per star rating, labelled with percentages.\n",
    "\n",
    "    Args:\n",
    "        x: Bar positions (the star values).\n",
    "        nums: Review count for each position in `x`.\n",
    "        gap: Vertical offset, in count units, between a bar's top and its label.\n",
    "    \"\"\"\n",
    "    total = sum(nums)  # loop-invariant, so computed once instead of once per bar\n",
    "    plt.bar(x, nums)\n",
    "\n",
    "    plt.title('影评分数的分布')\n",
    "    # Distinct loop names: the previous loop rebound the `x` parameter.\n",
    "    for star, count in zip(x, nums):\n",
    "        # Guard against an empty dataset, where total is 0.\n",
    "        share = (count / total) * 100 if total else 0.0\n",
    "        plt.text(star,\n",
    "                 count + gap,\n",
    "                 f\"{share:.2f}%\",  # the label used to render a stray '/' before '%'\n",
    "                 ha='center',\n",
    "                 fontsize=12)\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "plot_score_distribution(x, nums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T00:58:38.660359Z",
     "start_time": "2020-05-14T00:58:38.529029Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD2CAYAAADbPoDqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAATL0lEQVR4nO3db4xldX3H8ffXXUimMzougdyWfSCQkDa2wwo7pbu60DsE6FKqtcRWkm0NUTOJog/qPnCtGGOjLQ/EVomYTqVALXWy2goqCPiAKRsKFkYrY/+YaDuYToWt2XXWIduma799cC9hd5m9f8+9d3Z/71dys3d+5/zmfH+/e+79zDnn3ruRmUiSyvWKURcgSRotg0CSCmcQSFLhDAJJKpxBIEmF2zzqArp17rnn5gUXXNBz/xdeeIHx8fHqCqqIdXXHurpjXd05E+taXFz8UWaet+7CzDytbtu3b89+PProo331HxTr6o51dce6unMm1gU8nad4XfXUkCQVziCQpMIZBJJUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwBoEkFe60+4oJSRvLBfse6Lnv3qlj3NRj/+Vbr+95uzqRRwSSVDiDQJIKZxBIUuEMAkkqXNsgiIh3RcRC8/aPEfFnEXFnRDwREbcct16lbZKk4WgbBJn5mcysZ2YdOAB8H9iUmTuBiyLi4oi4ocq2AY1VkrSOaPx/BR2sGLEV+BPgOeChzHwwIm4ExoBLq2zLzLtO2vYsMAtQq9W2z8/P9zzgtbU1JiYmeu4/KNbVHevqziDrWlpZ7blvbQyeP9pb36mtkz1vt50z8XGcmZlZzMzp9ZZ18zmCm4HPAL8LrDTbDgGXAeMVt50gM+eAOYDp6ems1+tdlH2ihYUF+uk/KNbVHevqziDr6vVzAND4HMFtS719nGl5T73n7bbTar76+dxEv+7ePTGQx7Gji8UR8QpgBlgA1mj8JQ8w0fwdVbdJkoak0xfdK4BvNP/fy0VgV7N9G7A8gDZJ0pB0ekz2a8Bjzfv3AQci4nzgOmAHkBW3SZKGpKMjgsz8g8z82+b9I0AdeBKYyczVqtuqG54kqZ2ertJk5mFg/yDbJEnD4YVZSSqcQSBJhTMIJKlwBoEkFc4gkKTCGQSSVDiDQJIKZxBIUuEMAkkqnEEgSYUzCCSpcAaBJBXOIJCkwhkEklQ4g0CSCmcQSFLhDAJJKpxBIEmFMwgkqXAdB0FE3BERb2zevzMinoiIW45bXmmbJGk4OgqCiLgC+NnM/EpE3ABsysydwEURcXHVbQMZqSRpXW2DICLOAv4cWI6I3wTqwP7m4keAXQNokyQNSWRm6xUi3gFcD7wbeC+wD7gsM78dEdcClwEXA5+qqi0zbz2phllgFqBWq22fn5/vecBra2tMTEz03H9QrKs71tWdQda1tLLac9/aGDx/tLe+U1sne95uO63mq5/x9uvCyU09P44zMzOLmTm93rLNHfS/FJjLzOci4q+A1wNjzWUTNI4q1ipuO0FmzgFzANPT01mv1zsoe30LCwv0039QrKs71tWdQdZ1074Heu67d+oYty118jL0cst76j1vt51W89XPePt19+7xgTyOnVwj+B5wUfP+NHABL52+2QYsA4sVt0mShqSTKL4T+IuIuBE4i8Y5/S9HxPnAdcAOIIEDFbZJkoak7RFBZv4kM387M6/MzJ2Z+SyNMHgSmMnM1cw8UmVb1YOUJJ1aTyfnMvMwL73TZyBtkqTh8JPFklQ4g0CSCmcQSFLhDAJJKpxBIEmFMwgkqXAGgSQVziCQpMIZBJJUOINAkgpnEEhS4Xr7InBJ61paWR3J99Uv33r90LepM4dHBJJUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwBoEkFc4gkKTCtQyCiNgcET+IiIXmbSoiPhIRT0XEp49br9I2SdLwtDsiuAT4fGbWM7MOnA3sAi4H
DkbE1RGxvcq2AYxRktRCZOapF0a8G7gZeAFYAr4LrGXmHRGxA7gOWAX+u6q2zPzwOnXMArMAtVpt+/z8fM8DXltbY2Jiouf+g2Jd3dmodR08tMrzR4e/3amtky2XD3K+llZWe+5bG6Pn+Wo35n60mq9+xtuvCyc39fw4zszMLGbm9HrL2n3X0FPA1Zn5w4j4S2CMRhgAHAJqwDHg+xW2vUxmzgFzANPT01mv19uUfWoLCwv0039QrKs7G7Wu2++9n9uWhv8VXst76i2XD3K++vlupb1Tx3qer3Zj7ker+RrFd0m96O7d4wN5HNs9As9k5v807z8NnEUjDAAmaJxaWqu4TZI0RO1eeD8XEdsiYhPwZmCcxjl9gG3AMrBYcZskaYjaHRH8IfDXQABfBj4KHIiITwK7m7dngT+usE2SNEQtjwgy8zuZeUlmTmXmBzPz/4CrgQM0Luz+e9VtAxupJGldXV+lycyjwBcH2SZJGh4vzkpS4QwCSSqcQSBJhTMIJKlwBoEkFc4gkKTCGQSSVDiDQJIKZxBIUuEMAkkqnEEgSYUzCCSpcAaBJBXOIJCkwhkEklQ4g0CSCmcQSFLhDAJJKpxBIEmFMwgkqXAdBUFE1CLiW837d0bEExFxy3HLK22TJA1Pp0cEHwfGIuIGYFNm7gQuioiLq26rfoiSpFYiM1uvEHEV8DvALwDPAA9l5oMRcSMwBlxaZVtm3rVODbPALECtVts+Pz/f84DX1taYmJjouf+gWFd3NmpdBw+t8vzR4W93autky+WDnK+lldWe+9bG6Hm+2o25H63mq5/x9uvCyU09P44zMzOLmTm93rLNrTpGxNnAh4DfAu4DxoGV5uJDwGUDaHuZzJwD5gCmp6ezXq+3KrulhYUF+uk/KNbVnY1a1+333s9tSy2fVgOxvKfecvkg5+umfQ/03Hfv1LGe56vdmPvRar76GW+/7t49PpDHsd2poX3AHZn54+bPazT+kgeYaPavuk2SNETtXnivBm6OiAXgdcAbgV3NZduAZWCx4jZJ0hC1PCbLzCtfvN8MgzcBByLifOA6YAeQFbdJkoao41MxmVnPzCNAHXgSmMnM1arbqhqYJKkzXV+lyczDwP5BtkmShseLs5JUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwBoEkFc4gkKTCGQSSVDiDQJIKZxBIUuEMAkkqnEEgSYUzCCSpcAaBJBXOIJCkwhkEklQ4g0CSCtdREETEORFxTUScO+iCJEnD1TYIImIL8FXgcuDRiDgvIu6MiCci4pbj1qu0TZI0HJ0cEVwCvC8zPwY8DFwFbMrMncBFEXFxRNxQZdsgBipJWl9kZmcrRlwJfBT4HvDFzHwwIm4ExoBLgYeqasvMu07a9iwwC1Cr1bbPz8/3POC1tTUmJiZ67j8o1tWdjVrXwUOrPH90+Nud2jrZcvkg52tpZbXnvrUxep6vdmPuR6v56me8/bpwclPPj+PMzMxiZk6vt2xzJ78gIgJ4K3AYSGCluegQcBkwXnHbCTJzDpgDmJ6eznq93knZ61pYWKCf/oNiXd3ZqHXdfu/93LbU0dOqUst76i2XD3K+btr3QM99904d63m+2o25H63mq5/x9uvu3eMDeRw7ulicDTcDzwCvp/GXPMBE83esVdwmSRqSTi4Wvz8i3tb88dXArcCu5s/bgGVgseI2SdKQdHJMNgfsj4h3At8B7gMei4jzgeuAHTROFx2osE2SNCRtjwgy83BmXpOZV2bmuzNzFagDTwIzmbmamUeqbKt6kJKkU+vpKk1mHgb2D7JNkjQcXpiVpMIZBJJUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwBoEkFc4gkKTCGQSSVDiDQJIKZxBIUuEMAkkqnEEgSYUzCCSpcAaBJBXOIJCkwvX0P5SdzpZWVrlp3wND3+7yrdcPfZuS1AmPCCSpcMUdEZTmgj6PfvZOHev5CMqjIOn00PaIICImI+JrEfFIRHwp
Is6OiDsj4omIuOW49SptkyQNRyenhvYAn8jMa4HngBuBTZm5E7goIi6OiBuqbBvEQCVJ64vM7HzliC8CrwL+NDMfjIgbgTHgUuChqtoy866TtjsLzALUarXt8/PzPQ/44KFVnj/ac/eeTW2dbLl8bW2NiYmJyre7tLLaV//aGD3PV7sx92NQ89Wv0vYv6G8fOx33r36fU/24cHJTz4/jzMzMYmZOr7es42sEEbET2AIsAyvN5kPAZcB4xW0nyMw5YA5geno66/V6p2W/zO333s9tS8O/NLK8p95y+cLCAv2M61T6fYfU3qljPc9XuzH3Y1Dz1a/S9i/obx87HfevUbzr8EV37x4fyOPY0buGIuIc4Hbg7cAajb/kASaav6PqNknSkHRysfhs4AvABzLzWWAR2NVcvI3GEULVbZKkIenkmOwdNE7XfDAiPgjcBfxeRJwPXAfsABI4UGGbJGlI2h4RZOZnMnNLZtabt3uAOvAkMJOZq5l5pMq2qgcpSTq1nq7SZOZhYP8g2yRJw+GFWUkqnEEgSYUzCCSpcAaBJBXOIJCkwhkEklQ4g0CSCmcQSFLhDAJJKpxBIEmFMwgkqXAGgSQVziCQpMIZBJJUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwHQVBRNQi4kDz/lkR8ZWIeDwi3j6INknS8LQNgojYAtwDjDeb3gssZuYbgLdExCsH0CZJGpLIzNYrRLwKCOD+zKxHxJeBfZn5zxGxD/gG8PtVtmXmoyfVMAvMAtRqte3z8/M9D/jgoVWeP9pz955NbZ1suXxtbY2JiYnKt7u0stpX/9oYPc9XuzH3Y1Dz1a/S9i/obx87Hfevfp9T/bhwclPPj+PMzMxiZk6vt2xzu86ZeQQgIl5sGgdWmvcPAbUBtJ1cwxwwBzA9PZ31er1d2ad0+733c9tS22FXbnlPveXyhYUF+hnXqdy074G++u+dOtbzfLUbcz8GNV/9Km3/gv72sdNx/+r3OdWPu3ePD+Rx7OVi8Row1rw/0fwdVbdJkoaklxfdRWBX8/42YHkAbZKkIenlmOwe4MGIuAJ4LY3z/CsVt0mShqTjI4LMrDf/fRa4BngcuDozf1p1W3XDkyS109NVmsz8T2D/INskScPhhVlJKpxBIEmFMwgkqXAGgSQVziCQpMIZBJJUOINAkgpnEEhS4QwCSSqcQSBJhTMIJKlwBoEkFc4gkKTCGQSSVDiDQJIKZxBIUuEMAkkqnEEgSYUzCCSpcAaBJBVuwwRBRNwZEU9ExC2jrkWSSrIhgiAibgA2ZeZO4KKIuHjUNUlSKSIzR10DEfEp4KHMfDAibgTGMvOu45bPArPNH38e+G4fmzsX+FEf/QfFurpjXd2xru6ciXW9JjPPW2/B5t7rqdQ4sNK8fwi47PiFmTkHzFWxoYh4OjOnq/hdVbKu7lhXd6yrO6XVtSFODQFrwFjz/gQbpy5JOuNtlBfcRWBX8/42YHl0pUhSWTbKqaH7gAMRcT5wHbBjgNuq5BTTAFhXd6yrO9bVnaLq2hAXiwEiYgtwDfBYZj436nokqRQbJggkSaOxUa4RSKeViDgnIq6JiHNHXcvxNmpd2tjO2CCIiFpEHGix/KyI+EpEPB4Rb99AdW2NiP+IiIXmbd33/VZYz2REfC0iHomIL0XE2adYb6if/O6krojYHBE/OG6upoZU2xbgq8DlwKOneoxGMGdt6xrhnNUi4lstlo/kmwVa1TXCuepouxHxkYh4KiI+3e82z8ggaD4h7qHx+YRTeS+wmJlvAN4SEa/cIHX9CvCxzKw3b/814LL2AJ/IzGuB54DdJ68wok9+t60LuAT4/HFztTSEul7c7vsy82PAw5z0uRcY2Zy1rYvRzdnHeekt4icY8TcLnLIuRrt/tdxuRGyn8U7Ly4GDEXF1Pxs8I4MA+CnwVuBIi3XqwP7m/ceAYXx4pJO6dgDvjIhvRsQfDbqgzLwjM7/e/PE84OA6q9V5aa4e4aW3
+o66rh3Ab0TEPzT/ohzKu+Ay8+8y88mIuJLGE/GJdVarM/w566Suoc9ZRFwFvEAj0NdTZ8hzBR3VNZL9q8Pt/irwN9m4yPswcEU/GzwjgyAzj2TmapvVTv40c22wVXVc19doPDF+GdgZEZcMui6AiNgJbMnMJ9dZPPS56rCup4CrM/Ny4Czg14dYV9AI9cPA/66zykjmrIO6hjpnzVN6HwL2tVht6HPVYV2j2r862W6lc3ZGBkGHNuqnmf8+M3+SmT8FvgUM/DA5Is4BbgdOda1kJHPVQV3PZOYPm/efZghz9aJsuBl4BnjTOquMZM46qGvYc7YPuCMzf9xinVHMVSd1jWr/6mS7lc7ZRnnxG4WN+mnmhyPi5yLiZ4Brge8McmPNv4y+AHwgM589xWpDn6sO6/pcRGyLiE3Am4FvD7quZm3vj4i3NX98NbDei8ko5qyTuoY9Z1cDN0fEAvC6iPjsOuuM4rnYSV0j2b863G61c5aZZ+wNWGj+exXwnpOWvQb4J+CTNA7FNm2QumaAf6XxF917hlDLu2icRlho3j4MfPSkdV7V3Bk/AfwLMLlB6vql5jwt0bjAPqzHbwvwdRrXlu4AfnGDzFkndY1kzprbXgBeuxHmqsO6RrV/nbBd4Bzgsyet8wrg8ebr13eBC/vZZtEfKIvGV1rsAh7O9ufuixZ+8rtrzlnnnKvuRcQYcD3wzcz8t75+V8lBIEkq+xqBJAmDQJKKZxBIUuEMAkkqnEEgSYX7fwfEJgRrdKiJAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the raw 'star' column, using pandas' default binning.\n",
    "dataset['star'].hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 文本预处理\n",
    "- 删除符号\n",
    "- 繁体转简体\n",
    "- 分词后以空格连接\n",
    "- 英文大写转小写"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:04:33.816343Z",
     "start_time": "2020-05-14T00:59:47.238147Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 0.411 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0                         吴京 意淫 到 了 脑残 的 地步 看 了 恶心 想 吐\n",
       "1    首映礼 看 的 太 恐怖 了 这个 电影 不讲道理 的 完全 就是 吴京 在 实现 他 这个...\n",
       "2    吴京 的 炒作 水平 不输 冯小刚 但小刚 至少 不会 用 主旋律 来 炒作 吴京 让 人 ...\n",
       "3                     凭良心说 好 看到 不像 战狼 1 的 续集 完虐 湄公河 行动\n",
       "4                                                中二得 很\n",
       "5                   犯 我 中华 者 虽远必 诛 吴京 比 这句 话 还要 意淫 一百倍\n",
       "6                            脑子 是 个 好 东西 希望 编剧 们 都 能 有\n",
       "7    三星 半 实打实 的 7 分 第一集 在 爱国 主旋律 内部 做 着 各种 置换 与 较劲 ...\n",
       "8    开篇 长镜头 惊险 大气 引人入胜 结合 了 水平 不俗 的 快 剪下 实打实 的 真刀真枪...\n",
       "9    15 100 吴京 的 冷峰 在 这部 里 即 像 成龙 又 像杰 森斯坦 森 但 体制 外...\n",
       "Name: cleaned_comment, dtype: object"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full-width / CJK punctuation to strip; the backslash-n is kept raw so the regex reads it as a newline.\n",
    "punct = r\"，。！？、；：“”\\n＂＃＄％＆＇（）＊＋－／＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〟〰〾〿–—‛„‟…‧﹏★☆•→▽\"\n",
    "\n",
    "# Compiled once instead of on every call. The ASCII punctuation goes through\n",
    "# re.escape so that the backslash, ']', '^' and '-' are literal members of the\n",
    "# character class; unescaped, the backslash-bracket pair was parsed as an\n",
    "# escaped ']' and backslashes were never stripped. `punct` is inserted\n",
    "# unescaped on purpose, so its backslash-n stays a regex newline escape.\n",
    "_SPECIAL_CHARS_RE = re.compile(f'[{re.escape(string.punctuation)}{punct}]')\n",
    "\n",
    "\n",
    "def clean_special_chars(text):\n",
    "    \"\"\"Replace every ASCII / CJK punctuation mark and newline in `text` with a space.\n",
    "\n",
    "    Args:\n",
    "        text: The raw comment string.\n",
    "\n",
    "    Returns:\n",
    "        A string of the same length, with each matched character replaced by ' '.\n",
    "    \"\"\"\n",
    "    return _SPECIAL_CHARS_RE.sub(' ', text)\n",
    "\n",
    "\n",
    "def simplify(text):\n",
    "    \"\"\"Return `text` converted by zhconv to the 'zh-cn' (Simplified Chinese) locale.\"\"\"\n",
    "    simplified = zhconv.convert(text, 'zh-cn')\n",
    "    return simplified\n",
    "\n",
    "\n",
    "def cut_join(text):\n",
    "    \"\"\"Segment `text` with jieba and return the lower-cased tokens joined by single spaces.\n",
    "\n",
    "    Whitespace-only tokens are dropped.\n",
    "    \"\"\"\n",
    "    tokens = []\n",
    "    for token in jieba.cut(text):\n",
    "        if token.isspace():\n",
    "            continue\n",
    "        tokens.append(token.lower())\n",
    "    return ' '.join(tokens)\n",
    "\n",
    "\n",
    "def preprocess(text):\n",
    "    \"\"\"Clean one raw comment: blank out punctuation, convert to Simplified Chinese, then segment.\n",
    "\n",
    "    Returns the space-joined, lower-cased token string produced by `cut_join`.\n",
    "    \"\"\"\n",
    "    return cut_join(simplify(clean_special_chars(text)))\n",
    "\n",
    "\n",
    "# Run the cleaning pipeline on every raw comment (took roughly five minutes in the recorded run).\n",
    "dataset['cleaned_comment'] = dataset['comment'].map(preprocess)\n",
    "\n",
    "# Preview the first ten cleaned comments.\n",
    "dataset['cleaned_comment'].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:16:37.141254Z",
     "start_time": "2020-05-14T01:16:36.385260Z"
    }
   },
   "outputs": [],
   "source": [
    "# # Save the preprocessed data (disabled)\n",
    "# with open('../datasets/douban_comments.txt', 'w') as f:\n",
    "#     for comment in dataset['cleaned_comment']:\n",
    "#         f.write(comment + '\\n')\n",
    "\n",
    "# with open('../datasets/douban_comment_stars.txt', 'w') as f:\n",
    "#     for star in dataset['star']:\n",
    "#         f.write(str(star) + '\\n')  # NOTE: previously wrote `comment`, the stale variable from the loop above"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 查看影评的字数分布\n",
    "> **如何更好地处理不同影评的这种字数差别？**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:20:00.934179Z",
     "start_time": "2020-05-14T01:20:00.565789Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "min length:1, max length:4247, median length:23.0, mean length:37.4620741237351, mode length:10\n"
     ]
    }
   ],
   "source": [
    "# Character count of each raw (unsegmented) comment.\n",
    "dataset['comment_length'] = dataset['comment'].map(len).astype('int')\n",
    "\n",
    "comment_lengths = dataset['comment_length']\n",
    "min_, max_ = comment_lengths.min(), comment_lengths.max()\n",
    "median, mean = comment_lengths.median(), comment_lengths.mean()\n",
    "mode = comment_lengths.mode()[0]  # mode() can return several values; keep the first\n",
    "\n",
    "print(f\"min length:{min_}, max length:{max_}, median length:{median}, mean length:{mean}, mode length:{mode}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:20:33.720597Z",
     "start_time": "2020-05-14T01:20:25.790101Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlwAAAHxCAYAAAC4dmIdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzde3xV1Z3w/89Kwh1EQK4iclWuokK9thSteOlIbUWQTi/otIPtb6Z9+uvMOPrUYmS09efTX8dpZ6atM9XS6bRKpNPWOqK21mq9tMJ4CQFFbiqXoHIVBSXJev7YJxhjQg7hbHZO8nm/XvuVk33WXue7w+3LWmt/V4gxIkmSpPSUZB2AJElSe2fCJUmSlDITLkmSpJSZcEmSJKXMhEuSJCllJlySJEkpM+GSVJRCCCHrGA5Ha+IPIRwXQrguhDA4jZgkpceES1JBhBC6hBCGNTo3P4QwtMH3PUIInZq4NuSu79bEe+eFEB4PIfRr9NaXQwj3hRA6H0KM54UQljYVQxNtR4UQrs29vj2E8O+5OP8lhDAohHBiCKEqhDAiz88uDyHMyL0eCmwMIZyQb+w5lwLXAS3GL6ltMeGS1KwQwlEhhBhC2BhC2NDgeDWE8EKj5jcBj4cQ+uSuLQUuAx4KIfTPtdkEvJPr88AB1AH7gPuaCGMi0DvGuK3R+XOAN2OM7xzCLVUBE3KxtmQbMCeE8BXgHWA/8HFgGvAqcDLQG3ippY5CCKNJEqWhADHGjblY/uEg1/TKJaEh930AvgAsijG+3KhtCCGUhhC653FfkjIQrDQvqTm5Eae3gDdIkqJ6nYDnY4xTcu3mAbcA58YYqxpd/xCwKcZ4WW4qbD9QS5JIPQIMz/VfChBjfK1RDP8GvBpj/FqDc0cDW0gSlscbNH8lxrg212Yu8LNDvOX/L8Z4Te76c4DJwC6SkaUILAe6AXcB85Nw4/+Ta18CdIkx7m3caQjhR8BUYHKMsTZ3bgrwJ+DTMcb3xZlLRFujT4xxZyuvlZSSsqwDkNSm1Y+CnxRj3FB/MoTwcWBB7vUXgYXAeQ2TLYAY495c2/rk4TLgyRjjUyGEN3LndsUYd4YQbgY2A9/J9XsdDUaAQgj/O/dyAvBhIJAkPfNz548B/hW4Ovf928BLMcbh+dxoCOEXuWvqzQTGAyOAE4B1ufsoA14GZgCjcvdf71mSka+G/Z4PzAMuqk+2cj+b5SGEbwP/FkJYF2P8Y6OQhpEkp++QJLhPAT8HyoHTgY0kI4aN7crnfiUdWU4pSjqYEpJ/wJ8JIeysP4AfAXUhhJOAq4DpMcZnQwgnhRCODiH0CyGMCSEMJxkRqp8OnAp8s/GHhBCOAv4X0KXB6b3AY0Cf3DE8d34/8NfADTHG4fUH8F8kyUm93cCTIYSeIYT+ubiaO7oAfwTW1l8cY/wq8CXgKGAV8CjwCvAJkuTneKBnjDEAXwYeBE5rdF/HALcDP4kxLm3i53stySjfb0MIlzV8I8b4SoyxOsa4HfgiSdK1IDd6dStwdoxxZxOH0xZSG+QIl6QmhRDKgBqgX4yxNoRwLnAHMDL3fSnQFTg1xlgXQjgWuB/4d5IE6x8bdDeUZDTmFmBFCOE03pscfY4kwfpeg3O1QE399FiDh/r+EugLfLdRyJ1oMEIVY/wtSSJzRS7ug/l/Y4zvSQRDCJcA/wZ8AxhN8vflCbm+duS+7w68CQwimfZ8p8H1PYBfkqxN+2oIITROhmKMNSGET5AksBUhhMXAV2KMWxr08xHga8AlMcbdudP7c4ekIuEIl6Tm/BnJ6Nau3KjWz0kWiW/Lfb8rdwwLIfQkSS5+STLV+C8kCVD9E3zvAOSmHO8Hjm30WWcA/xxj3JNHXItJpiavzyWB9TqRJDeN/SfQOcYYGh4kidtu4PvATxpekBtxu4pk1O2HJGvMngU+CTwB/AVJ0jU5d8lAYH2jz/0pSYJ2C8ki
+7rGDwvk1mntA14A/g4YBxz4GYQQTsndbxlwb4NrJgG3N+rrwTx+dpIy4giXpCbFGH8ZQuhFkgg0ZXuM8V9DUgriEeAPwP/KjeLsh/eMSu3PTa8dQzI9FknWR0GyVunvgdJcmYRtDZ5I/HATi8dfjzFuCCFcSfLk30O585157xqs+vt4z0hQCGE8yejYMJJ1VY83cc3uEMLs3H3MIUmwRscY3wghLAd+RZJsfgj4DUmi9NtG3XyJZCp0Hcl0ZR3wNMn040MN2j0OvBxjvD2E8J36UbIQwodIEtjfAx8B/pzkZwzJ9Ob3SJI6gGtIHkKQ1EaZcEk6mFKShetXAFsbnD8T+FwI4SGSZOsbMcZbAUIIU2OMy5ro61Mkoz1vkyQfpbnzj5IkYCUkfyddC/xT7r3HgItzr3sDGxr0912StWUnxxifIVn/dWCEq0EpisZTb58mGZk6GdiTe+IRkkX4XUmmBmtJpkB7N7hudy6B3AkMAD4GfD2E8E3gFN77tCS50g315RueDSGMyd3zQzHG53MxdgKOJlkAT4NkawpJUnYPMJdkhGxPg+nVWuCtBt/vI5n+ldRGmXBJOpj6pOi7vL8sxEsxxudDCKNijG8A5NZL/SCEML6+PEMD/x5jrE+kCCGcTDLic/xByhg0tYYLgNwi/WdIEpJnSBbnN5ySrCB5mrE5rzRzfgRJYjcEOI6kdMMJMcatIYTvA2Uxxv0hhP8GbgNuBjbGGFuqx3Uh8DrJ9GG9wSSJ3ntiyT3B+GfAb3Pr5VroWlJbZ8IlqVm5KbQBwHagF8m6pVNIRlvW1LcBCCFcTjLNNa+JZGs4sCSE8Gf1ozsFciVJAVGAniRrsupdBNQ2LoyaK2T6lcblInIPAfQhuUdijG+FEK7J9XllCGE1ySjdqbn394YQbiOZzvubgwWZW0D/VZKnFRtOkQ7Jfd3Y+JoY4wMNvm1pvW1n3i29IakNctG8pGblFo8/BPxtg9NlwAO5OlmEEEpyNbJ+CMyNMd7ZoG39FjRLSaYH1xxiCGX1pRt47/QeADHGZxqs0epLgxpUMca9h1KFPsZYG2N8vWGtLJIaX58iecpyCUli89UQwsBcglafMI0KzQxD5R4oWEIy5dm4wv0oYHd90noQXZo6GUKYGEK4n6RMxoYW+pCUIRMuSU3KLXL/PcmU2j8CY3NvvUWygLtnCGESyRquvwTOiTH+slE3n899/SnJyNehrDPqBJxNMuK0g3cTiqb2YhxM8uRjU4VA39f8EGIg1++fAf8/cBLJtOXxwMMkTyHWL2j/cS65ahjXR0gWuk8kqVX2eu78tBDCV4H/DTS13q2xe0mmI+uV5O5jJcm6tX8CbjjE+5J0BDmlKKk524EbY4xLQghTScohPAasjTG+nRvQeRq4E7i4mXVYvwDqYozXNvFefeLU3H/8ugC/jzFOhwOjbc/RIGEKISwgqfg+EfiXGOO65m4mhPBhYDrwUd479dhc++tIykJUAZ+JMf4hd/51kkT034Grc1OLF5KMYv2EZL/F+oXvvyJ5enFmjLHhOq23SZKtx2hhOhIgxnhpo1OlJGvJ6oBLWrpeUvbcS1FSXkIII2KM6xudm9B4O59D6O+DJE8oHhtj3NzKPsaTFB19qqVpuVyx1buAF4HvxhjvaaH9SKB7jHFFo/ODgGOaON+PZG/F7Q3ODYuNNpouhBDCGuDWGOM/F7pvSekw4ZIkSUpZXmu4Qgg/DCE8Ub9I9lDa5BaXPn0ofUmSJLUnLSZcIYRLgdIY45nAyFzxvkNp8y2S+jh59SVJktTe5LNofjrJXl4ADwAfJFkD0WKb3D5nbwLV+fYVQphP8ig2PXr0mDJ27FiKxlu5h4i6H5NtHCmq3bEDgNI+fTKOJBs79iX336drx7x/SVLzli9f/nqMsX9T7+WTcPXg3Uett5Mr+tdSmxBCZ+DrwCdInlTKq68Y420k1ZuZOnVqXLYsnyemJUmSshVCaHbHiXzWcO0hNyVIUsm5
qWuaanMN8K+NHhXPpy9JkqR2JZ+EZznJ1B8kG75uyLPNecBfhRAeBk4OIfx7nn0Vr2V3JEc7tuOuxey4a3HLDdupitUVVKyuyDoMSVKRabEsRK7Y4KMkxfsuItkodnaM8bqDtDkjxrirwfsPxxint9SusaKbUizP7TxS3uwtFb1VY8cBMO75VRlHko1JiyYBUDmvMuNIJEltTQhheYxxalPvtbiGK8a4O4QwnaSa8y0xxmrg2Rba7Gr0/vR82kmS1NHt37+fjRs3sm/fvqxDUTO6du3K0KFD6dTpfTuNNSuvrX1ijDt49+nCVrc5lHaSJHVEGzdupFevXgwfPpxm9kRXhmKMbNu2jY0bNzJixIi8r3PRuiRJbci+ffvo16+fyVYbFUKgX79+hzwCacIlSVIbY7LVtrXm18eES5IkKWUmXJIk6bBcccUVbNiwIZW+V61axSWXXHLg+507dzJt2jTOPvts7rvvvmava6pdvtemIa9F88pTOy4HUa+jloOoZzkISTp8l1xyCbt2vftv5p//+Z8zf/7897Vbu3Ytf/d3f8eePXsOnFuwYAF/8Rd/wWc+8xnOO+88Lrzwwian+Jpql++1aTDhkiSpLauv8diUi2+FqVcmr5fdAb/+ykH6yW9QYMqUKQwYMIDOnTtTXV3NlVdeyZw5c5g3bx47d+5kypQp3Hrrraxfv55PfepTdO/end27dwOwdetWrrjiCnbt2sXMmTO59tprm/yMX/7yl3nF0qtXL5YsWcIFF1xw4NwjjzzCN77xDUpLSznxxBPZsGFDk08LNtUu32vT4JSiJEk64K233qKiooLnnnuOn/70p/zxj3/kG9/4BnPnzuXRRx9l165dLF26lFtuuYWrr76apUuX8sYbbwDwzW9+k8svv5zHH3+cX/ziF2zbtu2wYhkwYABdunR5z7mysjJ69uwJQN++fdm6dWuT1zbVLt9r0+AIVyH9YFry9apHso0jResvnQXAiJ8vyTiSbMy5Zw4Ai2daSk7SEZLvcpWpV7472nUYBg4cSM+ePTn++OMpLS0lxsjKlSv5whe+AMDpp5/OqlWrWL9+PZMnT6asrIyTTz4ZgBdeeIEnnniCH/3oR7z55pts3ryZfv36ve8z8p1SbEppaemB13v27KGuri7vdvlemwZHuAppy7PJ0Y7tW7mSfStXZh1GZlZtX8Wq7R17HZukjmfChAk8+eSTADz55JNMmDCBYcOGUVVVRW1tLZWVyfrWE088kZtvvpmHH36Ya665hr59+zbZ3y9/+UsefvjhA0e+yVZ9LPXb/j377LMcf/zxebfL99o0OMIlSZIO6tprr+Wzn/0s3//+95k6dSrnn38+I0eO5NOf/jTf/va36dy5MwDXXHMNn/vc57juuusYMWIEc+fOLXgsX/ziF/nc5z7H6aefTq9evTj22GP56U9/SufOnbnssssO2q6pc0dKi5tXZ8nNq9seN69282pJ6Vq1ahXjxo3LOow2bc2aNTzzzDPMnDnzfWu8WmqX77UtaerX6bA2r5YkSWpLRo8ezejRo1vVLt9rC801XJIkSSkz4ZIkSUqZU4qFdOq8rCNI3dGzZ2cdQqZmjZmVdQiSpCJkwlVIH/tO1hGkbvA/LMw6hEyVn1WedQiS1KFs2bKFqqqqA08WFiunFCVJUpu0evVqLr/8ch577DE+/OEP884773D99dczffp0pk+fztixY/nmN7950D5WrFjBjBkz3nNu5syZPPPMM2mG/j6OcBXS5qeTr0NOyTaOFO1dUQVAt4kTMo4kG1Xbkvuf0K9j3r8kFUK+leafe+457rjjDkaNGkVlZSXr16/nhhtuOPD+ZZddxmc/+9lmPyfGyFe/+lX2799/4Nx//ud/MmrUqAPV8Y8UE65Cum168rUd1+HakCsq11HrcM39dVLEzzpcko6U+vp/TVlw5gJmn5Csra1YXcHCJ5pf9pHv31tNbV79iU984n2bUm/evJk5c+YQQmDatGnc
dNNNXHHFFYwcOZIHH3yQ2tpafvvb39KtW7f3fUa+m1dfdtll1NTUcO+997Jjx473lHN46qmnGDp06EGLl95xxx2cc8453H///QBs376dv/mbv+GLX/wiv/vd7zjnnHPyiqMQnFKUJEkHNLV5dVObUm/atImbb76Z++67j3vuuefA9Xv27OHRRx9l7NixPP3004cdz549e1i8eDHHH388IYQD5//pn/6JL33pS81et23bNn7yk5/wt3/7twfO/eM//iOzZ8/mqquu4sc//jG/+tWvDju+fDnCJUlSG5bvyNTsE2YfGO06HE1tXt3UptRlZWXccMMN9OzZkzfeeOPA9fPmJU/sDxs2jHfeeafJzziUzauPPvpoFi1axGc+8xmeeuopTj/9dHbu3Mmrr77KqFGjmr2Pa665hm9+85t06tTpwLmnn36ab33rWwwaNIg5c+bw4IMP8rGPfeyQfj6tZcIlSZIO6sQTT+SSSy7hnHPO4Sc/+Ql9+/blmmuu4dprr2Xy5MmcdNJJB9r26NGjxf7ynVL84he/yCc/+UmmTZvGzp07Ofroow9c/9GPfvSg1/7+97/nxRdfBOCZZ57huuuuY/To0axbt46xY8eybNkyN6+WJEltR1ObUl988cV84QtfoH///nTv3p1NmzYV/HOvvvpqPvOZzxBC4Pzzz+fEE08E4P7773/PVOFDDz3EypUr+eu//usD51avXn3g9fTp07nxxhvZvHkzn//857npppvo3r07P//5zwsec3PcvLqQ3Ly63XPzaklpc/Pq4nCom1e7aF6SJCllTikW0vyHs44gdcPvvjvrEDJ158V3Zh2CJKkImXAVUjsueFqvoxY8rWfBU0lSazilKEmSlDITrkL61ZeTox3b8vUFbPn6gqzDyEz54+WUP16edRiSpDxs2bKF3/zmN++pE5YVE65C+p9FydGO7ayoYGdFRdZhZGbJi0tY8uKSrMOQpDbliiuuYMOGDQBUV1dz8803H9HPf/nll5k+fTrnnnsu8+fPJ8bY5MbXTdm/fz8zZ87k7LPP5vbbb2/23OEy4ZIkSQUzaNAgrrnmmoL0dckllzB9+vQDx2233dZkux/84Ad873vf46GHHuKVV16hsrLywMbX119/PSNHjmT9+vVNXvvd736XKVOm8Nhjj3H33XfzxhtvNHnucLloXpKkNqy+/mFTBt1wA30unwPAjrsWU3399c22zbd+Yr6bV69fv55PfepTdO/end27dx+4fsOGDZSXl/OjH/0I4Ihscn3TTTcdeL1t2zaOOeYYTjrppGY3vm7o4YcfPjAiN23aNJYtW9bkucPd6NoRLkmSdEC+m1ffcsstXH311SxduvSgI0BHapNrgLvuuosJEyYwZMiQA5/R1MbXDb355psce+yxAPTt25etW7c2ee5wOcIlSVIblu/IVJ/L5xwY7Toc+W5evX79eiZPnkxZWRknn3xys/0dqU2u161bx7e+9S1+85vfHDjX1MbXjfXs2ZO9e/fSu3dv9uzZQ8+ePZs8d7hMuCRJ0kE1tXn1sGHDqKqqYtiwYVRWNr/d2be//e3UN7nesWMHn/zkJ7n99tvp3TvZZq+5ja8bmzJlCn/4wx+47LLLePbZZznjjDOaPHe4TLgKafDkrCNIXdfx47MOIVPj+rq/maSOp6nNq6+++mo+/elP8+1vf5vOnTs3e+2R2OT65ptv5uWXX+ZLX/oSADfccEOTG183tcn1vHnz+OhHP8qjjz7KypUrOf300zn22GPfd+5wuXm1JEltiJtXH3mbN2/mD3/4AxdccMGBEbKmzjV0qJtXO8IlSZI6tCFDhjBnzpwWzx0On1KUJKmNacuzT2rdr48JVyGV906OdmzV2HEHrQnT3k1aNIlJiyZlHYakdqxr165s27bNpKuNijGybds2unbtekjXOaUoSVIbMnToUDZu3Mhrr72WdShqRteuXRk6dOghXWPCJUlSG9KpUydGjBiRdRgqMKcUJUmS
UmbCJUmSlLJUE64QQt8QwowQwjFpfo4kSVJbllfCFUL4YQjhiRDCdfm2CSH0AX4NnAb8LoTQP4RQFkJ4OYTwcO7wcS9JktTutbhoPoRwKVAaYzwzhHB7CGFMjPHFltoAQ4CvxhifzCVfpwKvAT+LMf59CveSvYtvzTqC1A264YasQ8jUgjMXZB2CJKkI5fOU4nRgce71A8AHgRdbahNjvAMghDCNZJRrIfBp4OIQwjlAJXBVjLHmMOJvW6ZemXUEqSvETvTFbPYJs7MOQZJUhPKZUuwB1O80uR0YmG+bEEIALgd2APuBp4DzYoynAZ2AjzbuKIQwP4SwLISwzBokkiSpPcgn4doDdMu97tnMNU22iYm/Ap4DPgY8F2Pckmu3DBjTuKMY420xxqkxxqn9+/fP+0bahGV3JEc7tuOuxey4a3HLDdupitUVVKyuyDoMSVKRySfhWk4yjQgwGdiQT5sQwt+HED6bO3c0sBP4jxDC5BBCKfBx4NnWBt4m/forydGOVV9/PdXXX591GJlZ+MRCFj6xMOswJElFJp81XL8AHg0hDAEuAuaGEG6MMV53kDZnkCRzi0MInwdWkKzt2gT8FAjAr2KMvyncrUiSJLVNLSZcMcbdIYTpwAzglhhjNY1Gppposyv31oxG3a0ATjrcoCVJkopJXnspxhh38O5TiK1uI0mS1BG5tY8kSVLKTLgkSZJSZsIlSZKUshBjzDqGZk2dOjUuW7Ys6zAkSZJaFEJYHmOc2tR7jnBJkiSlzIRLkiQpZSZchfSDacnRjq2/dBbrL52VdRiZmXPPHObc07E38JYkHbq86nApT1va105FTdm3cmXWIWRq1fZVWYcgSSpCjnBJkiSlzIRLkiQpZSZckiRJKTPhkiRJSpkJlyRJUsp8SrGQTp2XdQSpO3r27KxDyNSsMR23JIYkqfXc2keSJKkA3NpHkiQpQyZchbT56eRox/auqGLviqqsw8hM1bYqqrZ13PuXJLWOa7gK6bbpydfyXZmGkaYNl10GwLjnO2bF9bm/ngtA5bzKjCORJBUTR7gkSZJSZsIlSZKUMhMuSZKklJlwSZIkpcyES5IkKWUmXJIkSSmzLEQhzX846whSN/zuu7MOIVN3Xnxn1iFIkoqQCVchDTkl6whS123ihKxDyNSEfh37/iVJreOUoiRJUspMuArpV19OjnZsy9cXsOXrC7IOIzPlj5dT/nh51mFIkoqMCVch/c+i5GjHdlZUsLOiIuswMrPkxSUseXFJ1mFIkoqMCZckSVLKTLgkSZJSZsIlSZKUMhMuSZKklJlwSZIkpczCp4U0eHLWEaSu6/jxWYeQqXF9x2UdgiSpCIUYY9YxNGvq1Klx2bJlWYchSZLUohDC8hjj1Kbec0pRkiQpZSZckiRJKTPhKqTy3snRjq0aO45VYzvuOqZJiyYxadGkrMOQJBUZEy5JkqSUmXBJkiSlzIRLkiQpZSZckiRJKTPhkiRJSpkJlyRJUspS3donhNAXmAI8HWN8Pc3PahMuvjXrCFI36IYbsg4hUwvOXJB1CJKkIpTX1j4hhB8C44F7Y4w35tMmhNAHuDd3zAXOjTG+lk9f9dzaR5IkFYvD2tonhHApUBpjPBMYGUIYk2ebk4CvxhhvAu4HTs2nL0mSpPYmnzVc04HFudcPAB/Mp02M8fcxxidDCNOA04An8ukrhDA/hLAshLDstddey/M22ohldyRHO7bjrsXsuGtxyw3bqYrVFVSsrsg6DElSkclnDVcPYFPu9Xbg1HzbhBACcDmwA9ifT18xxtuA2yCZUsznJtqMX38l+Tr1ymzjSFH19dcD0OfyORlHko2FTywEYPYJszOORJJUTPIZ4doDdMu97tnMNU22iYm/Ap4DPpZnX5IkSe1KPgnPct6d+psMbMinTQjh70MIn82dOxrYmWdfkiRJ7Uo+U4q/AB4NIQwBLgLmhhBujDFed5A2Z5Akc4tDCJ8HVpCs2erVRLt2p7y86deS
JKljajHhijHuDiFMB2YAt8QYq4FnW2izK/fWjEbdNddOkiSp3cqr8GmMcQfvPl3Y6jaH0k6SJKm9cNG6JElSylLd2qfDKd/V7tdsjXt+VdYhZKpyXmXWIUiSipAjXJIkSSlzhOsQNB69au+jWZIkqTBMuA5D44RrPtOYD9zGI1mEc0Ssv3QWACN+viTjSLIx556kwv7imT73IUnKX4dMuJoamSrEaNWQ91bLaJf2rVyZdQiZWrW9Y69hkyS1jmu4JEmSUmbCJUmSlLIOOaV4JLnQXpIkOcIlSZKUMhMuSZKklDmlWEDLmZd1CKk7evbsrEPI1Kwxs7IOQZJUhEy4CugevpN1CKkb/A8Lsw4hU+VnlWcdgiSpCDmlKEmSlDITrgIazNMM5umsw0jV3hVV7F1RlXUYmanaVkXVto57/5Kk1nFKsYCuYjoA5ezKNpAUbbjsMgDGPd8xK67P/fVcACrnVWYciSSpmDjCJUmSlDITLkmSpJSZcEmSJKXMhEuSJCllJlySJEkpM+GSJElKmWUhmlFefujX/ICHCx1GmzP87ruzDiFTd158Z9YhSJKKkAlXAW3hlKxDSF23iROyDiFTE/p17PuXJLWOCdcR1tTIWWtG0yRJUvEw4copRNIzky8D7XsT6y1fXwB03E2syx8vT766ibUk6RC4aL6AprCIKSzKOoxU7ayoYGdFRdZhZGbJi0tY8uKSrMOQJBUZEy5JkqSUmXBJkiSlzIRLkiQpZSZckiRJKTPhkiRJSpllIQpoM5OzDiF1XcePzzqETI3rOy7rECRJRciEq4Bu45GsQ0jdiJ937JIIi2cuzjoESVIRckpRkiQpZSZckiRJKTPhKqByelNO76zDSNWqseNYNbbjrmOatGgSkxZNyjoMSVKRcQ1XG9B4H0c3s5YkqX1xhEuSJCllJlySJEkpM+GSJElKmQmXJElSyky4JEmSUuZTigV0D7dmHULqBt1wQ9YhZGrBmQuyDkGSVIRMuApoOVdmHULq+lw+J+sQMjX7hNlZhyBJKkJ5TSmGEH4YQngihHBdvm1CCL1DCPeFEB4IIfxXCKFzCKEshPByCOHh3GEFSUmS1O61mHCFEC4FSmOMZwIjQwhj8mzzKeDbMcbzgWrgQo7BcrkAACAASURBVOAk4Gcxxum5o7KQN5O1KdzBFO7IOoxU7bhrMTvu6rgbOFesrqBidUXWYUiSikw+U4rTgfp/YR8APgi82FKbGOO/Nni/P/AqcAZwcQjhHKASuCrGWNOwoxDCfGA+wLBhw/K9jzZhJl8B2vfUYvX11wMdd2px4RMLAacWJUmHJp8pxR7Aptzr7cDAQ2kTQjgT6BNjfBJ4Cjgvxnga0An4aOOOYoy3xRinxhin9u/fP+8bkSRJaqvyGeHaA3TLve5J00lak21CCH2B7wKzcu89F2N8O/d6GfC+6UlJkqT2Jp8RruUk04gAk4EN+bQJIXQGKoBrY4wv5d77jxDC5BBCKfBx4NnWBi5JklQs8hnh+gXwaAhhCHARMDeEcGOM8bqDtDkD+BxwKvC1EMLXgO8BC4GfAgH4VYzxN4W7lfajvDy/c5IkqTi0mHDFGHeHEKYDM4BbYozVNBqZaqLNLpIE63tNdHnS4QYtSZJUTPIqfBpj3MG7TyG2uo0kSVJHZKX5AipnV9YhpG7c86uyDiFTlfPaVek4SdIR4ubVkiRJKXOEq0g0XjTvInpJkoqHI1wFNJ9pzGda1mGkav2ls1h/6ayWG7ZTc+6Zw5x7OmaVfUlS6znCVUBDOkBZsX0rV2YdQqZWbe/Ya9gkSa3jCJckSVLKTLgkSZJSZsIlSZKUMhMuSZKklJlwSZIkpcynFAtoOfOyDiF1R8+enXUImZo1puOWxJAktZ4JVwHdw3eyDiF1g/9hYdYhZKr8rPKsQ5AkFSGnFCVJklJmwlVAg3mawTyddRip2ruiir0rqrIOIzNV26qo2tZx71+S1DpOKRbQVUwHoJxd2QaSog2X
XQbAuOc7ZsX1ub+eC0DlvMqMI5EkFRMTriLV1ObVbmgtSVLb5JSiJElSyky4JEmSUmbCJUmSlDITLkmSpJSZcEmSJKXMpxQL6Ac8nHUIqRt+991Zh5CpOy++M+sQJElFyISrgLZwStYhpK7bxAlZh5CpCf069v1LklrHKUVJkqSUmXAV0Ey+zEy+nHUYqdry9QVs+fqCrMPITPnj5ZQ/Xp51GJKkItMhphSPVAX2KSwC4B6+c2Q+MAM7KyoAGPwPCzOOJBtLXlwCQPlZ5dkGIkkqKo5wSZIkpcyES5IkKWUmXJIkSSnrEGu4OorGa9WO1No1SZJ0cI5wSZIkpcwRrgLazOSsQ0hd1/Hjsw4hU+P6jss6BElSETLhKqDbeCTrEFI34udLsg4hU4tnLs46BElSEXJKUZIkKWUmXJIkSSkz4SqgcnpTTu+sw0jVqrHjWDW2465jmrRoEpMWTco6DElSkTHhkiRJSpkJlyRJUspMuCRJklJmwiVJkpQyEy5JkqSUmXBJkiSlzErzBXQPt2YdQuoG3XBD1iFkasGZC7IOQZJUhEKMMesYmjV16tS4bNmyw+6nvPzwY2kP/DlIkpSeEMLyGOPUpt5zSlGSJClleSVcIYQfhhCeCCFcl2+bEELvEMJ9IYQHQgj/FULonG9fxWoKdzCFO7IOI1U77lrMjrs67gbOFasrqFhdkXUYkqQi02LCFUK4FCiNMZ4JjAwhjMmzzaeAb8cYzweqgQvz6auYzeQrzOQrWYeRqurrr6f6+uuzDiMzC59YyMInFmYdhiSpyOQzwjUdqB/SeAD4YD5tYoz/GmN8MHeuP/Bqnn1JkiS1K/kkXD2ATbnX24GBh9ImhHAm0CfG+GQ+fYUQ5ocQloUQlr322mt53YQkSVJblk/CtQfolnvds5lrmmwTQugLfBf4i3z7ijHeFmOcGmOc2r9//3zuQZIkqU3LJ+FazrtTf5OBDfm0yS2SrwCujTG+dAh9SZIktSv5FD79BfBoCGEIcBEwN4RwY4zxuoO0OQP4HHAq8LUQwteA7zXTTpIkqV1rMeGKMe4OIUwHZgC3xBirgWdbaLOLJMH6XuP+mmgnSZLUrllpvoPzZyNJUmFYaV6SJClDJlySJEkpM+EqoPlMYz7Tsg4jVesvncX6S2dlHUZm5twzhzn3zMk6DElSkcnnKUXlach7nyVol/atXJl1CJlatX1V1iFIkoqQI1ySJEkpM+GSJElKmQmXJElSyky4JEmSUmbCJUmSlDKfUiyg5czLOoTUHT17dtYhZGrWmI5bEkOS1Hpu7aP38GclSVLruLWPJElShky4CmgwTzOYp7MOI1V7V1Sxd0VV1mFkpmpbFVXbOu79S5JaxzVcBXQV0wEoZ1e2gaRow2WXATDu+Y5ZcX3ur+cCUDmvMuNIJEnFxBEuSZKklJlwSZIkpaxdTin6pJ0kSWpLHOGSJElKmQmXJElSyky4JEmSUtYu13Bl5Qc8nHUIh62p9W8Nzw2/++4jFUqbdOfFd2YdgiSpCJlwFdAWTsk6hNR1mzgh6xAyNaFfx75/SVLrOKUoSZKUMhOuAprJl5nJl7MOI1Vbvr6ALV9fkHUYmSl/vJzyx8uzDkOSVGRMuApoCouYwqKsw0jVzooKdlZUZB1GZpa8uIQlLy7JOgxJUpEx4ZIkSUqZCZckSVLKTLgkSZJSZsIlSZKUMhMuSZKklFn4tIA2MznrEFLRsNL8jD7jGTI4s1AyN67vuKxDkCQVIROuArqNR7IOIXUPXrCkye1/OorFMxdnHYIkqQg5pShJkpQyEy5JkqSUmXAVUDm9Kad31mGk6vI7x7FqbMddxzRp0SQmLZqUdRiSpCJjwiVJkpQyEy5JkqSUmXBJkiSlzLIQapXGpSE6cqkISZJa4giXJElSyky4JEmSUuaUYgHdw61Zh5C6p6bekHUImVpw5oKsQ5AkFSETrgJazpVZh5C6daPnZB1CpmafMDvrECRJRcgpRUmSpJSZ
cBXQFO5gCndkHUaqRq5ZzMg1HXcD54rVFVSsrsg6DElSkXFKsYBm8hWgfU8tfmDZ9UDHnVpc+MRCwKlFSdKhySvhCiH8EBgP3BtjvDHfNiGEgcDdMcYP5b4/FvgjsCZ32ewY42uHdwtqC6zLJUlS81qcUgwhXAqUxhjPBEaGEMbk0yaE0AdYBPRo0PR04KYY4/TcYbIlSZLavXzWcE0H6hftPAB8MM82tcDlwO4G7c4APh9C+J8Qwjea+rAQwvwQwrIQwrLXXjMfkyRJxS+fhKsHsCn3ejswMJ82McbdMcZdjdrdR5KcfQA4M4RwUuOOYoy3xRinxhin9u/fP4/wJEmS2rZ8Eq49QLfc657NXJNPG4DHY4xvxBhrgaeB901PSpIktTf5JFzLeXcacTKwoZVtAO4PIQwOIXQHzgdW5B2pJElSkcrnKcVfAI+GEIYAFwFzQwg3xhivO0ibM5rp6wbgd8A7wPdjjC+0PvS2p5zGM6jtz11zV2UdQqYq51VmHYIkqQi1mHDFGHeHEKYDM4BbYozVwLMttNnV4L3pDV7/DhhbkMglSZKKRF51uGKMO3j3KcRWt5EkSeqIrDRfQPOZBsBtPJJxJOmZcf8sAB68YMlB27XXQqhz7kkq7C+e6f8tJEn5M+EqoCHvnWltl/ruWJl1CJlatb1jr2GTJLWOm1dLkiSlzIRLkiQpZSZckiRJKTPhkiRJSpkJlyRJUsp8SrGAljMv6xBSt3bk7FZd11RZiGIsFTFrzKysQ5AkFSETrgK6h+9kHULqlp22MOsQMlV+VnnWIUiSipBTipIkSSkz4SqgwTzNYJ7OOoxU9dleRZ/tVVmHkZmqbVVUbeu49y9Jah2nFAvoKqYDUM6ugzcsYuc/cBkAd83tmBXX5/56LgCV8yozjkSSVEwc4ZIkSUqZCZckSVLKTLgkSZJS5houZaZxHa5irMslSVI+HOGSJElKmQmXJElSypxSLKAf8HDWIaTugfPvzjqETN158Z1ZhyBJKkImXAW0hVOyDiF1O/pOyDqETE3o17HvX5LUOk4pSpIkpcyEq4Bm8mVm8uWsw0jV1D8tYOqfFmQdRmbKHy+n/PHyrMOQJBUZpxQLaAqLALiH72QcSXpGrasAYNlpCwved1NlIdpaqYglLy4BoPys8mwDkSQVFUe4JEmSUmbCJUmSlDITLkmSpJS5hkttmtv/SJLaA0e4JEmSUuYIVwFtZnLWIaRue5/xWYeQqXF9x2UdgiSpCJlwFdBtPJJ1CKl78IIlWYeQqcUzF2cdgiSpCJlwqagUQ60uSZIacw2XJElSyky4Cqic3pTTO+swUnX5neO4/M6Ou45p0qJJTFo0KeswJElFxoRLkiQpZSZckiRJKTPhkiRJSplPKaroWY1ektTWOcIlSZKUMhMuSZKklDmlWED3cGvWIaTuqak3ZB1CphacuSDrECRJRciEq4CWc2XWIaRu3eg5WYeQqdknzM46BElSETLhUrvjInpJUlvjGq4CmsIdTOGOrMNI1cg1ixm5puNu4FyxuoKK1RVZhyFJKjKOcBXQTL4CtO+pxQ8sux7ouFOLC59YCDi1KEk6NI5wSZIkpSyvhCuE8MMQwhMhhOsOpU0IYWAI4dEG33cKIdwTQngshPAXhxe6JElScWhxSjGEcClQGmM8M4RwewhhTIzxxZbaAK8Di4AeDZp+CVgeYywPIfx3CKEixvhGAe9Hep+mFs27kF6SdCTlM8I1HahfJf0A8ME829QClwO7m2n3CDD1UIKVJEkqRvkkXD2ATbnX24GB+bSJMe6OMe461L5CCPNDCMtCCMtee+21PMKTJElq2/JJuPYA3XKvezZzTT5t8moXY7wtxjg1xji1f//+eYQnSZLUtuVTFmI5yRThk8Bk4IVWtmnY7u5cuycPMd42rZzGA3rtz11zV2UdQqYq51VmHYIkqQjlk3D9Ang0hDAEuAiYG0K4McZ43UHanNFMX4uA/w4hfAgYD/yx9aFL
rWc1eknSkdTilGKMcTfJYvcngXNijM82SraaarOrwXvTG7x+CZgBPAacF2OsPfxbkCRJatvyqjQfY9zBu08XtrpNrt3mfNoVo/lMA+A2Hsk4kvTMuH8WAA9esCTjSLIx556kwv7ime3yt7AkKSVu7VNAQ3g26xBS13fHyqxDyNSq7R17DZskqXVMuCQsjipJSpd7KUqSJKXMhEuSJCllTilKzbB0hCSpUBzhkiRJSpkjXAW0nHlZh5C6tSNnZx1CpmaNmZV1CJKkImTCVUD38J2sQ0jdstMWZh1CpsrPKs86BElSETLhkvJk6QhJUmu5hquABvM0g3k66zBS1Wd7FX22V2UdRmZ2dK5iR+eOe/+SpNZxhKuArmI6AOXsOnjDInb+A5cBcNfcjllx/aFj5wIwa31lxpFIkoqJI1xSK9XFyBNrt/HytreyDkWS1MY5wiW1wuqal1lRu44f/9ubAPQNR3F86SBu+dIgxgzslXF0kqS2xoRLytP+WHPg9eM1lfQLRzGt08nsjW/zUm01T9esZsY/rmZU/x5cOHEQF00czIQhRxFCyDBqSVJbYMIltWBffIdVNRt4vnYDnXPnZnQ6jSElxxxIpiaUjeStuI9JF1aztKqa7/9+Hf/yu7UM7dONCycM4qJJgzjluD6UlJh8SVJHZMIlNePNuJeqmvWsrn2ZGmo5rmQgO3PvHVva/33tu4eufObM4XzmzOFsf/MdfrNyK0urqvnxEy/x739Yz4BeXbhgwiAumjiI00b0pazUJZSS1FGYcEmN7Krbw4ratayt3UQERpYMYVLZKI4u6cWSFq59ty5XZ+A4bi8/jt379vO7519l6Ypq7l6+kf948iX6dO/EjPEDuWjiYM4a3Y8uZaVp3pIkKWMmXAX0Ax7OOoTUPXD+3VmHkJrX63ZRWbOGl+qqKaWEE0uPZ0LZCHqG7gfanLvpzkPu96iunbjk5GO55ORj2ftOLb9f/RpLV2zhvspqFi/bSK8uZZw7bgAXTRzEtBP6072zfywlqb0JMcasY2jW1KlT47Jlyw75Oqt/K18xRqrrtvFc7Vq21L1OJ8oYVzqccWXD6Ra6pPKZ9b8/36mp47G1r7O0spoHVlaz4639dO1UwvQTBnDhxEGcO24AR3XtlEoMkqTCCyEsjzFObeo9/yutDinGyMt1W6msWcvrcSfd6MLUsrGcUDqMzuHIJDmdy0o458QBnHPiAG6qncifNmxn6Ypq7q9KFt53Kg2cPfoYLpo4iBnjB9G3R+eWO5UktUmOcBXQTL4MtO9NrKf+aQFQvJtY18U61tVtprJmLbviHnqF7kwsHcmo0qGUhZbXUS3vVw7AlG3lrY6hpd+fdXWRp1/ZmUw7rqhm4469lAQ4fUQ/Lpo0iAsmDGLgUV1b/fmSpHQ4wnWETGER0L4TrlHrKoDiS7hqYi2ra1+mqmYdb7KPPuEopnU6heElgygJ+T8tuOGoZNn84SRcLSkpCUw5vg9Tju/D//7oOKo27+b+qmruW1HNgl9WseCXVZw67GgumjiYCycO4ri+3VvuVJKUKRMutWtvx/08X7uBlTUbeJt3GBj6cmbZJI4t6Z9ZQdLGI1wHG/EKITDx2N5MPLY3f3P+iax59Q2WrkiSr5v+exU3/fcqJgw5iosmDuLCiYMYPcAq95LUFplwqV16K+6jqmY9L9S+RA21DC0ZwKSyUQws6Zt1aIdl9IBe/PW5vfjrc8fwyva3csnXFr71wGq+9cBqRg/oyYUTkuTLKveS1HaYcKld2V33Jitq17GmdiOROkaUDGFi2Sj6lhyVdWjNamqEK591iMf17c5fThvJX04bydbd+5LF9iuq+deH1/DPv1vDcX275ZKvwZxy3NFWuZekDJlwqV3YVreLypq1vFS3hUAJY0qHMrF0FL1KOsb6poFHdeWzZw7ns7kq9w+uTJKvHz2+gX97dD0Dj0qq3F84cRCnDbfKvSQdaSZcKloxRrbG7VTWrGVT3Wt0oowJ
paMYXzac7qHjPsXXt0dnLv/AMC7/wDB279vPQ6uSKveLl73Cj594ib49OjNj3EAunDSIs0ZZ5V6SjgQTrgLazOSsQ0jd9j7jsw6BGCMb616lsmYtr8YddKUzp5adyImlx9Ml5RpaR789LtX+C+2orp34+CnH8vFTjuWtd2p4ZPVr3Leimnsrt3DXslfo1aWMj4xLCq1++IQBdOts8iVJabAOl4pGXaxjfd0WKmvWsjO+QQ+6MbFsJGNKj8urhlYxK/Tv6bdranl8zTbuW7GFB1duZcdb++nWqZTpJ/ZPqtyPHUAvq9xL0iGxDpeKWk2sZU3tK6yoXceeuJejQ08+1GkyI0qGHFINrWLW2oX1zelSVso5YwdwztgB1NTW8af127kvV+X+vhXVdC4t4ezR/bho4mBmjB9IH6vcS9JhMeFSm/VO3M/ztS+xsmYD+3ib/uFoTus0geNKBljuoIDKSks4a/QxnDX6GG742ASefmUH91Um2wv9bslzlP5X4PQRfbloYlLlfoBV7iXpkDmlWEDl9M593ZVNAEfA5Xcma5jumrsqtc/YG99mZc16nq99if3UMKSkPyeVjWJg6Jt5orVkxCQAZq2vzDSOphT6932MkarNu7kvt8XQutfeJAQ4dVifA8mXVe4l6V1OKaoovFH3FlW163ix9hVqqWN4yWAmlY2iX0nvrEPrkBpWuf+7C8by4tZ3q9zfeO8qbrx3FROPPepAra/RA3pmHbIktVkmXMrcjro3qKxZw/q6LQQCo0uPZWLpKI4q6ZF1aGpgzMBejBnYiy99ZAwvb3uLpVXJyFd9lfsxA3pyYW6LofGDrXIvSQ2ZcCkzr9Zt57matWyse5UyShlfOpzxZSPp0YFraB2OQ9mj8XAN69ed+dNGMX/aKKp37csttt/Cv/xuDd99aA3D+nY/kHydPNQq95JkwqUjKsbIprrXqKxZy9a4nS504pSyExhbejxdgk/CFaNBvbsy76zhzDtrONv2vM2DK7eytKqaOx5bz22PrGPQUV25YMJALpw4mA8M72OVe0kdkgmXjoi6GHkpV0Nre9xNd7pyWtl4xpQeR6fgb8M0FLqURD769ezC3NOGMfe0Yezau5+Hnt/K0hXV3LXsFRblqtyfP34gF04cxFmjjqFzmcmXpI7Bf+mUqtpYy5raTayoXcsb8S16hx6cXXYSI0uPpbSD1NDqqHp368QnThnKJ04Zylvv1PD7F5Iq979+bgt3PvUKvbqWcd64gVwwYRAfPqG/Ve4ltWsmXAV0D7dmHULqnpp6Q17t9scaXqh9iaqa9ezlbY4JvZnaaQrDSgYW9WLqU15fkHUIh+VIrvNqqHvnMi6aNJiLJg1m3/5aHl/7OvdVVvPgqq3819Ob6NaplHPG9ueCCVa5l9Q+mXAV0HKuzDqE1K0bPeeg7++Lb7OyZgPP127gHWoYXHIMHyo9mcEl/Yo60ao38o3ZWYdQ9Lp2KuXcsQM5d+xA9h+ocr+F+6u28t+VSZX7D445hgsnDmLGOKvcS2ofLHyqgtgT91JVs47VtS9TSx3HlwxiUtkojik5OuvQdIiy+vNTVxf5n5d3cN+KapauqGbTzr2UlgTOGNmXCycO5oIJAxnQyydYJbVdByt8asJVQFO4A2jfI10j1ywG3h3p2ln3BpU161hXtwmAUbkaWkeXtM8imOt6VQAda6Qriz9PMUZWbNp9oNZXfZX7KcP6cKFV7iW1Ue064WpLo1kdaWuff57zBJU1a3i5bitllDKm9Dgmlo2kR+iWcYTpastb+6Ql6z9jMUZefHXPgSr3q7bsBmDSsb0P1Poa1b99JviSiotb+6ggGibn977zGJ3pxOTSMYwrG05Xa2gpJSEEThjYixMG9uLLHxnDhtffzBVareb/3P8C/+f+FzhhYM8DWwyNG9yrXawXlNS+mHCpRTFGXqqrprJmLXNz56aWjePE0mHW0OoAsqjndTDDj+nBVR8exVUfHsWWXXu5Pzfy9c+/
W8N3HlrD8f2655KvQUy2yr2kNiKvfy1DCD8ExgP3xhhvzLdN43MhhDJgXe4A+FKMsePMzRSZ2ljHutpNVNauZXd8k17h3TUzE8tGZhiZlBjcuxtXnD2CK84eweu5Kvf3rajmh39Yzw9yVe7rpx0/MLwvpSZfkjLSYsIVQrgUKI0xnhlCuD2EMCbG+GJLbYBJTZzrBfwsxvj3adyMCmN/rGF17StU1azjLfbRNxzFhzudwvElg7MOTW1EVvW8DuaYnl345GnD+GSuyv1vVyVV7n/2p5f50eMb6NejM+dPSAqtWuVe0pGWzwjXdGBx7vUDwAeBF/Noc0oT57oBF4cQzgEqgatijDWtjF0F9nZ8h1W1G1hVs4G32c/A0Jezy05iSMkxronRQbW1BKx3t05ceupQLj11KG++XcPvVydV7n/1zGZ+9qdXOKq+yv3EpMp9105WuZeUrnwSrh7Aptzr7cCpebZp6txvgfNijFtCCD8GPgr8qmFHIYT5wHyAYcOG5X0jar034z5W1qzjhdqXqaGW40oGMqlsFANK+mQdmopUW1r31aNLGR+dNJiP5qrcP7bmde5bUc2DK7fy8wZV7i+cOJhzxw6gZxfXJUoqvHz+ZtlDMjIF0BNoahy+qTZNnXsuxvh27twyYEzjjmKMtwG3QVIWIo/42oxiKwexq24PK2rXsbZ2IxEYWTKEiWWj6FPSq9lr7pq76sgF2AZ1pHIQhdYWRsG6dirlI+MG8pFxSZX7P65rVOW+rIQPjc5VuR8/kKO7+/StpMLIJ+FaTjId+CQwGXghzzYbmzj3HyGEm4AVwMeBbxxm/GqFbXW7eK5mDS/VVVNKCSeUDmNC6Uh6lVhIUh1Hp9wWQh8ccwwLL5mYVLmvrOb+qmp++/yrlJYEzhzZjwsnDuJ8q9xLOkwtFj4NIRwFPEoyHXgRMBeYHWO87iBtzgBiE+eOA34KBOBXMcavHeyzi63waVsWY6S6bjuVtWvYXPc6nShjXOnxjCsbQbfQJevwJKBt/HmOMVK5aRdLc1sMrXs9qXI/9fg+XJArNzG0j/85kfR+h11pPoTQB5gBPBJjrM63TT7XHUyxJVzzmQbAbTyScSTvijHySt1WnqtZy+txJ13pwoSyEZxYOozOodMh9zfj/lkAPHjBkkKHWhR+OyTZ0ugjmxe30FKFkPWf7xgjq7fWV7nfwvPVbwBw0tDeXDBhEBdNHMRIq9xLyjnsSvMxxh28+8Rh3m3yua49GcKzWYdwQF2sY13dZlbUrGVn3EPP0I0zyiYyunQoZaH1T2T13bGygFEWn51dOvYatiMt68X3IQROHNSLEwf14n+dl1S5X9qoyv2JA3txwcQk+Ro7yCr3kprm4zjtTE2s5cXaV1hRs4432Uuf0ItpnU5meMlgSoJ1h1T8slx8P/yYHnzhw6P4wodHsXnn3gNbDH33oRf5zm9fZHi/7rnkazCTh/Y2+ZJ0gAlXO/F23M/zuRpa+3iHAaEPZ5RNYGjJAP/Sl1Iw5OhuXHn2CK48ewSvvZFUuV9aVc0PH13PD36/jsG9ux6YdpxqlXupwzPhKnJvxX2srFnPC7Uvs58ahpb0Z1LZaAaW9M06NOmIaAvlJvr36sKfnz6MPz99GLve2s9vViXJ109zVe6P6dmZGeOTBfdnjepHp1JHm6WOxoSrSO2ue5MVtetYU7uRSB3DS4YwqWwUfUuOyjo0KVNZr/vq3b0Ts6YMZdaUpMr97154laUrqvnVM5v42Z9eTqrcjx/IhRMGMc0q91KHYcJVZLbX7aayZi0b6jYTKGF06VAmlo7kqJIeWYcmtVlZjYL16FLGxScN4eKThrBvfy1/eDGpcv+bVVv5+f9sonvnUs45cQAXThzEOVa5l9o1/3QX0HLmpdb31rrtVNasYWPda3SijAmlIxlfNoLu4cgWY1w7cvYR/by2ZvjuWVmHoALIYhSsa6dSzhs/kPPGJ1Xun1y3jftWVPNAVTX3Vm6hc1kJ08Yc
w4UTBzNj3EB6dz/0si2S2q686nBlpdjqcBVajJGNda9SWbOWV+MOutCZ8WXDGVs6nC6tqKElKX9HtKD/NwAAEW5JREFU6u+W2rrI8pd2JFsMrahm8659lJUEzhzV78AWQ1a5l4rDYRc+zUpHTbjqYh0b6rZQWbOWHfENetCNiWUjGVN63GHV0JJ0eNL++ybGyHMbd3HfimqWrtjChm1vEQJ84Pi+XDAxWXR/7NHdWu5IUib+b3v3HhxXed5x/Pvsri6+yJaML/INW5IBY0s2jkmAYIhDAoMzoc0QEjyh6UzdmRCahslk0o4zpKC6FHpJSEOmaUITGNqQEhPSNLQYSBpSLg1QCEnkCwRLtjG+xjaSL7KkvTz94xzZstBlLe/R2ZV/n5kd7549u3r3QV5+Puc9z6vANUpm8ioAe1g2otdnPMvW7FtsyrZxxDuptok0phqoT8wqmh5aNYc2AfD2lMUxjyQeb5cHn7+m5+z8/HKqKL9/3J3X9x05scRQb5f7pXMmn+j1VTdVczdFiokC1yhpZnL4Z8dpva7H07yefZNNmW100c1Uq2ZJqoG5iRlF10PrxocvBOD7q8/OjuuP1jUB8NFtLTGPRIpRlN9H2w4cC8PXHn79VvAds7C2Kuj11VTLBTPU5V4kbme8tI9E47h3syWzjS3ZHaTJMCsxlabkMmoTU/TFKVKC8glcI92nuXkCt6xs4JaVDexqP86T4ZGve3/2Bl8Lu9xf2ziTVY21LFGXe5Gio8AVgyO5TjZl23gju5MsOeYlamlKLWBqYnLcQxORiBXiCsnZ1eNYs6KONSuCLvdPbQ7C17efbeOb/9PKrMmVwZyvxepyL1IsFLhG0du5I7RkWtmW240BDWEPrcmJiXEPTURidCanIqdVVXDTJfO46ZJ5tHf28NMtQaPVh158kwee387UiRVcszhotHqZutyLxEaBaxTsz71NS6aVnbl9pEiyKDmfRak6JpiuNhKRwqkeX84Ny+dww/I5HO3O8PRr+3li015+9OouvvfiyS73qxpncsV5U9XlXmQUKXBFxN3ZnTvAbzJb2eeHqKCMi1LnsTA5n0orj3t4IlJiTrdb/sSKFNctncV1S4Mu98++cYANG/fw081Bl/sJ5UlWLpzOqsZa3n/BdCaoy71IpPQ3LALbsrtpybRyyA8znkrenVrE+cm5lJnKLSKFcTqT7yvLkly9aAZXL5pBT+Zkl/ufbN7Lf/2mt8v9NFY11vJBdbkXiYQSQIFkPctt2YfZmtvFi7lXmWQTuDy1hPrkbJJF0kOrEJ665gdxDyFWV+16OO4hiJyR8lSCK8+fxpXnT+POjzTy8vZDbNi4lyc3BWs89na5X9U4k6sXzWBaVUXcQxYZE9SH6wylPcPr2TfZnGmjk27Osck0pRo4N1FLQpdli0gRGer70t359VsdbNi4hyc27mVHb5f7+VO4dnHQ5X6WutyLDEmNTyPQ5T0nemj1kGZm4hyakguYmThH/W9EpGQ1Nwfh67W9J7vcv74v7HI/t5prF9eyqrGW+epyL/IOClwFdMyPszET9NDKkOXcxAyaUguYlqjmOm4F4DHuHd1BjaKLX7odgJffsy7mkcTjlXOaAVh+sDnWcYiMpo7cUXbk9rIju5eDHnS5n109jurxZUweF9yqx5cxadzJxwPdqirL1BNMxjR1mi+A9txRNmZbac3uAqA+MZumVD3ViaoT+yznQWBsB66GtkeAszdwbZ/0KKDAJWeXyYmJLEksYElqAUe9kx3ZfRw83E7H4TT7PU0PXXR7hh7S5MgN+V5VlalBA1n/wNY30CmsSalT4BrGgVw7LZlWduT2kiTBwuQ8FqfqmageWiJyFppo41mcqhvwOXcnS45u0vR4mh7SdPf909P0pNN096Tp6EhTOTfNG/uP0nE8TUdnmp5s4cJa/+cU1iRuClwDcHf25A7Skt3KntxBykmxNLmAC1PzqTRdsSMiMhAzI0WSFEkmWOXwL9h38q6n
nLV35ILwNcjtcL/HJ8La8TQ9mWHCWkWKSf2OmimsyWhS4OrD3Xkzt5eWTCsHvINxVHBxaiEXJOeph5aISITMjL+9KwkkgcHD2lebB97elc6eGtA6Bw9uIw1r7whk40+GsmqFNRmGUgSQ9Rxt2V20ZFs57MeosvG8N9VEQ3I2SdPSFyIixWLwC6WGD2uTw1v/98knrPU9utb6u5NhrbsAYW3AI2uVKVJa93JMOasDV9ozvJHdyaZMG8foosYm8b6yZcxLzFQPLRGRMezU4DZwWMvnKniFNcnXWRm4ur2HLdkdbMlso5s0M2wKl6WamJ2YdkY9tHaztICjLE6HahbFPYRYVXdfGPcQRGSU5Nd26NSwdjqtirrS2XfMS+t7a+8ceVibWJHqE8pOvdigeny5wloMzqo+XMe8i82ZNl7PvkmGLHMS01mSamB6YsqZDVRERGQAUfWKHC6sDXahQcfxNF3pkYe1wVt4lCusoT5cHM4doyXsoeU4dYmZNKUaqElMintoIiIyhkUVuJqbk1SWJZk+KY+rQfs53bC27cCxSMNa722sh7UxHbgO5jpoybSyPbeHBAnOT85lcbKeqsT4uIcmIiIyYvkEuYH2CbadPA16uoGwO5Md+MhZZ5qO4xnaj/eMOKxNKE++I5D1b+MxUFibNK6MshIIa2PulKK7s88P8ZtMK7tzv6OMFAuT81iUqmNcxD20msPrX5rpiPTnxOnGh4M5TN9fvSXmkcTj0bomAD66rSXmkYiIjI5CHaUbLqwNdSr0eDo75HsPFNaGutCgOqKwdlacUnR3dub205LZyu+8nUrKeVfqAhYm51FuZXEPT0REpCSN5GjawEfXkkyvSjK96vRPg55uWNtxsDOysDZlfDlTJpZTVZE6rQvtSj5w5TzHttxuWjKttPtRJto4Lk01siA5h5R6aImIiERupKc48zd8n7Wq8Nb/53Rnstz+V5kTy0z1eJoPX3/q1aAjCWvlyQSpbBkVVkGllVNJ+ZCfoGQDV1c6y/qXd/LDnjaO+nGqrYoryi6iLjGThBX/uVwREREpvHcGuyTjLHnKtKJf/cepe/SGtX4vI5vI9VsTtIdu0nR5D13eQ3ey+8T9A3QOOa6SC1wdx9N894Ud3P/cNg4e62G61XBJ2WLmJKafUQ8tERERkb6SlmAcFXnPAd84xHMlE7j2H+ni/ue289ALOzjSnWHlBdO45X0NPP7gFAUtERERKWpFH7jePNjJt55p5ZFX3iKTzfGhppncsrKBxbOCKwI3KGuJiIhIkSvqwLXzUCcrv/w0qUSCjy6fw81X1jN/6oS4hzWox/iHuIcQuf+7+C/jHkKslh24Pe4hiIhICSrqwHW4K8PnrqhnzYo6Zoygm+5oe4U/insIkWtb8PG4hxCr+iMfi3sIIiJSgoo6cC2sreKLH9JiwSIiIlLairp/QjJRWhO0lvMAy3kg7mFEqn7reuq3ro97GLFpq3qEtqpH4h6GiIiUmKI+wlVqruNzwNg+tfjul+8Azt5Ti69OXQfo1KKIiJyekgtcUa28LiIiIhKVoj6lKCIiIjIWKHCJiIiIREyBS0RERCRieQUuM/uOmf3CzL50Ovvku01ERERkLBs2cJnZ9UDS3S8D6s3svHz2yXdbYT+OiIiISPExdx96B7N7gSfc/XEzWw2Mc/cHhtsHWJbPtgHe61PAp8KHjQy9+LacuanAgbgHMYapvtFTjaOnGkdL9Y3eaNV4nrtPG+iJfNpCTAB2hfcPAe/Kc598t53C3e8D7gMws5fd/eI8xigjpBpHS/WNnmocPdU4Wqpv9IqhxvnM4TpKcHQKYOIgrxlon3y3iYiIiIxp+QSeV4AV4f2lwPY898l3m4iIiMiYls8pxR8Bz5rZLGAVsNrM7nT3Lw2xz6WA57ltKPed1qeRkVCNo6X6Rk81jp5qHC3VN3qx13jYSfMAZlYDXA084+57890n320iIiIiY1legUtERERERk6T1kVEREQiVrSB
Sx3pC8/MJpvZBjN7ysz+3czKVedomNkMM3s1vK8aF5iZfcPMrgvvq74FZGY1Zva4mb1sZt8Kt6nGBRJ+Nzwb3i8zs8fM7HkzWzPYNslfv/qea2Y/N7Ofmdl9FoitvkUZuNSRPjI3Afe4+zXAXmA1qnNUvgyM0+9y4ZnZFUCtuz+m+kbik8BDYc+iKjP7c1TjggjnMD9I0JMS4LPAK+5+OXCDmVUNsk3yMEB9bwZucfergLlAEzHWtygDF7ASWB/ef4qTrSTkDLj7N9z9J+HDacAfoDoXnJldBRwjCLUrUY0LxszKgH8GtpvZ76P6RuEg0Ghm1QT/k6pDNS6ULHAjcDh8vJKTtX0GuHiQbZKfU+rr7re5+5bwuXMIOs2vJKb6Fmvg6t+RfkaMYxlzzOwyoAbYiepcUGZWDvwFsDbcpN/lwvpDYDPwd8B7gM+g+hbac8A84FZgC1COalwQ7n7Y3Tv6bBro+0HfGSM0QH0BMLMbgU3uvpsY61usgUsd6SNiZlOArwNrUJ2jsBb4hru3h49V48JaBtwXtpT5LsG/UFXfwroD+LS7rwNeAz6BahwVrcgSMTOrB74AfC7cFFt9i/U/pDrSRyA8+vII8EV334HqHIUPAp8xs58DFwHXoRoX0lagPrx/MTAf1bfQaoAmM0sClwB/g2ocFa3IEqFwTte/AWv6HPmKrb75dJqPw0Cd6+XM/THBguG3mdltwAPAJ1XnwnH3K3vvh6Hr99DvciF9B7jfzFYDZQTzMX6s+hbU3QTfDfOAXwBfRb/DUXkQeDy8EGQR8CLB6a7+22Rk1gLnAl83MwiO3g5U81FRtI1P1ZF+dKjO0VONo6X6Rk81jk4YZFcAT/YehRlomxROXPUt2sAlIiIiMlYU6xwuERERkTFDgUtEREQkYgpcIlIyzGxpuDxHImyC2ve5cjN7x3eamVWa2cRh3rem0GMVEelLgUtESsnHgGZgCfBEuC5ou5k9BTwBLDSzu8PeO70+Djw0zPt+z8wG7DhtZuvM7P1m9tdmttbMqszsybBtgohIXjRpXkRKRngE6w6C0DXN3feb2QvufqmZNRE06pxHELA+RrCawhPh9kfc/bk+73MA+OUgP2oZQQfqSuDTBN3W6wj+kfpVYJ27X997RM3dcxF8XBEZQxS4RKQkmFmZu6fD+7OBf3H3D/QJXL8FrnL3t8ysxt3fNrPPE3SW/lfgxwQd1FvDoPRtgqWBurzfF6GZ3Qt8niB0/Wl4+xXwv8Bk4L1AO7AA+Ii7vxR9BUSklOmUooiUirvMrMXM7nb3XUCXmS0EMLPLCY5W3WBmvwY+a2Y3AdcDm939OMGRqvVmdpW759x9DfD3wNNm9vPw1m5mP3b3W909Q7AY7nzgK8BbQC1B9/XbgJuB9QpbIpIPHeESkZJhZisIGnCuAxYCO4CfAqsJ1kf7LXAFQQf6FuAIcDnwNYIO0/8IvObu2/u9bwNwO/A8cH8YtjCzOQTrrX2SYAWMNHAX8BJBF/YL3P3eqD6viIwdxbq0j4jIYBz4E+Aj4f0LCU4PQrCY9TMA7v5oGNCuJZhkf4+7Pw0QTpC/B+idezUVmEQw/+sTFqwD8mXgEHAncB7BumsXEczl+mH4ug2RfUoRGVMUuESk1JxDcHT+AwDhHK4P9j5pZiuBcjP7DvB94EV3v7Xfe7wCvN/ds+FrPgyscPe14eMUkHD3HjNbT7B+4AZgk7unzeyXBIFvXZQfVETGDs3hEpFS0ghcA6wfYp8a4LMEp/w6gcO9T5jZF81spgeyfV5jBEfLAHD3jLv3hA+/Cfw38CWg0czqgMVAN8Fi8CIiw9IRLhEpJf9JMFH9UJ9t/Zua7gY+7u4bzGwusDo8tdjbKPWevjub2aeBPwPW9v9hYUPUfwLaCI5yLQIeAL4A7AN+YGafcPfWM/5kIjKmadK8iJzVzKwCyPQ74tX3+VSfSfRGcKqx91Sk9W8pISIyEAUuERERkYhpDpeIiIhIxBS4RERE
RCKmwCUiIiISMQUuERERkYgpcImIiIhE7P8BMh3A7p4k8dcAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram + density curve of comment lengths, with mode/mean/median marked.\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sn.distplot(\n",
    "    dataset['comment_length'],\n",
    "    bins=dataset['comment_length'].max(),  # bin count = largest comment_length value\n",
    "    hist_kws=dict(alpha=0.5, color='blue'),\n",
    "    ax=ax,\n",
    ")\n",
    "# Crop the right tail: the x-axis stops at the 95th percentile of comment length.\n",
    "ax.set_xlim(left=0, right=np.percentile(dataset['comment_length'], 95))\n",
    "ax.set_xlabel('评论字数')\n",
    "ymax = 0.04\n",
    "ax.set_ylim(0, ymax)\n",
    "# Dashed vertical reference lines from y=0 to y=ymax.\n",
    "# NOTE(review): mode, mean and median are not defined in this cell -- presumably\n",
    "# computed in an earlier cell; confirm before running on a fresh kernel.\n",
    "for stat_label, stat_value in (('mode', mode), ('mean', mean), ('median', median)):\n",
    "    ax.plot([stat_value, stat_value], [0, ymax], '--',\n",
    "            label=f'{stat_label} = {stat_value:.2f}', linewidth=2)\n",
    "ax.set_title('影评的字数分布', fontsize=16)\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:21:08.272172Z",
     "start_time": "2020-05-14T01:21:08.251709Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.0"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5th percentile of comment length: roughly 5% of comments are no longer than this value\n",
    "np.quantile(dataset['comment_length'], 0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:24:55.075479Z",
     "start_time": "2020-05-14T01:24:55.064532Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "      <th>cleaned_comment</th>\n",
       "      <th>comment_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>赞</td>\n",
       "      <td>5</td>\n",
       "      <td>赞</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>赞</td>\n",
       "      <td>5</td>\n",
       "      <td>赞</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>247</th>\n",
       "      <td>好</td>\n",
       "      <td>5</td>\n",
       "      <td>好</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>752</th>\n",
       "      <td>屎</td>\n",
       "      <td>1</td>\n",
       "      <td>屎</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2135</th>\n",
       "      <td>哎</td>\n",
       "      <td>3</td>\n",
       "      <td>哎</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2140</th>\n",
       "      <td>哎</td>\n",
       "      <td>3</td>\n",
       "      <td>哎</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2760</th>\n",
       "      <td>牛</td>\n",
       "      <td>5</td>\n",
       "      <td>牛</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3420</th>\n",
       "      <td>x</td>\n",
       "      <td>4</td>\n",
       "      <td>x</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4352</th>\n",
       "      <td>烂</td>\n",
       "      <td>1</td>\n",
       "      <td>烂</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4605</th>\n",
       "      <td>爽</td>\n",
       "      <td>5</td>\n",
       "      <td>爽</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     comment  star cleaned_comment  comment_length\n",
       "191        赞     5               赞               1\n",
       "198        赞     5               赞               1\n",
       "247        好     5               好               1\n",
       "752        屎     1               屎               1\n",
       "2135       哎     3               哎               1\n",
       "2140       哎     3               哎               1\n",
       "2760       牛     5               牛               1\n",
       "3420       x     4               x               1\n",
       "4352       烂     1               烂               1\n",
       "4605       爽     5               爽               1"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 10 rows whose comment_length is exactly 1\n",
    "dataset.loc[dataset['comment_length'].eq(1)].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:26:26.315090Z",
     "start_time": "2020-05-14T01:26:26.303579Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>温馨</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>216</th>\n",
       "      <td>温馨</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>525</th>\n",
       "      <td>幼齿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>疯了</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>606</th>\n",
       "      <td>疯了</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642</th>\n",
       "      <td>战狗</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>648</th>\n",
       "      <td>战狗</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>……</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>788</th>\n",
       "      <td>傻逼</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1261</th>\n",
       "      <td>性感</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     comment  star\n",
       "204       温馨     5\n",
       "216       温馨     5\n",
       "525       幼齿     1\n",
       "565       疯了     5\n",
       "606       疯了     5\n",
       "642       战狗     1\n",
       "648       战狗     1\n",
       "681       ……     1\n",
       "788       傻逼     1\n",
       "1261      性感     5"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 10 rows whose comment_length is exactly 2 (raw comment and star rating only)\n",
    "dataset.loc[dataset['comment_length'].eq(2), ['comment', 'star']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:27:09.630990Z",
     "start_time": "2020-05-14T01:27:09.616275Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>没看过</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>にまび</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174</th>\n",
       "      <td>超感动</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>超感动</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>好看！</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>210</th>\n",
       "      <td>励志。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>218</th>\n",
       "      <td>好看！</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>222</th>\n",
       "      <td>励志。</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>234</th>\n",
       "      <td>煽情。</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>煽情。</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    comment  star\n",
       "110     没看过     1\n",
       "136     にまび     1\n",
       "174     超感动     5\n",
       "176     超感动     5\n",
       "206     好看！     5\n",
       "210     励志。     4\n",
       "218     好看！     5\n",
       "222     励志。     4\n",
       "234     煽情。     3\n",
       "242     煽情。     3"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 10 rows whose comment_length is exactly 3 (raw comment and star rating only)\n",
    "dataset.loc[dataset['comment_length'].eq(3), ['comment', 'star']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:28:01.866145Z",
     "start_time": "2020-05-14T01:28:01.850034Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cleaned_comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>中二得 很</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>3d 扣分</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>爱 与 坚持</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162</th>\n",
       "      <td>勇敢 面对</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>180</th>\n",
       "      <td>励志 大片</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>208</th>\n",
       "      <td>mit</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>220</th>\n",
       "      <td>mit</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>251</th>\n",
       "      <td>平民 励志</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258</th>\n",
       "      <td>平民 励志</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>苦尽甘来</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    cleaned_comment  star\n",
       "4             中二得 很     1\n",
       "67            3d 扣分     2\n",
       "141          爱 与 坚持     3\n",
       "162           勇敢 面对     4\n",
       "180           励志 大片     5\n",
       "208             mit     4\n",
       "220             mit     4\n",
       "251           平民 励志     4\n",
       "258           平民 励志     4\n",
       "265            苦尽甘来     4"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 10 rows whose comment_length is exactly 4 (cleaned comment text and star rating)\n",
    "dataset.loc[dataset['comment_length'].eq(4), ['cleaned_comment', 'star']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:33:59.031110Z",
     "start_time": "2020-05-14T01:33:58.906874Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yangbin7/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEGCAYAAACJnEVTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de5yN5f7/8ddnnEKGwTTKYRxSQqgm6YChFDpQ2+5cv7J37L2p2ZXd1tlPbV+d9nc6sEMHPUqiohzKrv0LSchIpY2KkIxxyGFSKsbn98e6jWFmWLNmrFH3+/l4rEf3/Vn34bqm8V7Xuu57zTJ3R0REwiGhvBsgIiLxo9AXEQkRhb6ISIgo9EVEQkShLyISIgp9kcPAzKyIWsXyaItIQQp9+VUys7pmlhAsNzazE4LligeGq0VUKeY4LYL/NjWzrsVsM9DMjjKz98zsZDP7m5klmtkoM+tUTBM7mNm7B9Rmm9mph+jXIdtzwPYJZjbbzFIPta0IKPTlCGVm/2tm/zWzWcHjEzMbEzyXBMwCrgo2bwrMMLNGwB+ALDNbZGZbzewLYBGwyMxqHHCOC4HXglG5A6PNrFoRzakI3APsBqoDV7p7LtAV+LaYLpwMzC1wrlpAXWDxQfocbXsK6gFUc/c1h9hOBIj8MoscifYQGZTs/R2tAPxiZscDI4EMd/9/AO7+npndC5wOjAEmuPs2M3sDyARmAye7+/d7Dx68S7gfuNsjn1BcZWZvBdv3K7DdUUA2kRBOAtKB94MR/tHu/nWwXRV3/zlYnhq0ZaeZ9QYeBBKBWsBXwcxPPXc/Oob2nAmMB3YQeRE6AVhvZp8U+NlZ8HMb4O6zov+RSxiYPpErRyIzOwdoDqQFpYXAWqAy8DGwDbgX+L/A7iAoCUL2ene/rEDo7wJeAU5w953BdncA3dy9W4FzHg3MB/4NDHJ3N7NEYBjQnkiQvwhsAVKIjPTXAo2A74F27v69mS0FzgpeeB4EVgI3AP3c/YvgXCvdvVmBc0fVngN+RlcBfd29m5k9A9zi7j/G8OOWENH0jhxxzOwvwD+B/wOcC3QjEpr/AE5x9w3A/wIbiIT+cjPbFYTkm0CbvXPjgQHAsAKBfy7wVwqMoAHcfQeRID+PyGj+RGAn8DWwDphD5MVmK9AMeMrd04D3iEz57H0nseeALnWOHD4S+AduU8L27N2nMfBAgX26EHk3InJQmt6RI467jzSzZcBpBzz1ubvPMLObgcru/mRQv8vMVgM/ExnI/B9gU4H9pgGTzKwCUA0YTeRdwjwzq0pkOiQ32LY2kQA+HcgDGgCbgTuAoUSmYHoDlQq0rxGR0XxxPgKeMbNn3f0PQW0PQHCdoSTtwcyOBaYDqcDkYLqoATDfzDxoz3XuPv0gbZKQ0vSOHJHM7DGgCpAVlFoANYhMrbQB+hAJxI3BNMxq4HjgLSCZSEA2AzYSmXqpDLzq7g+YWWV3/yU4z8PAOnd/PFifQeRdwfvB+mnAM0Tm4xOITOfsAnoSuZjcHXjf3U8u0PbPiYy684B6wGB3H2tm84H73f3fZvaFu58YbF+S9rQFJgGPA3e4e4OgvgJo7e4/mdlYYJy7H3j3kIimd+SIlUdkyqIf8Gcid6kY8BTwOyIh/i6R0N2rDvCgu58STLvMBvq7e5q7t3H3BwD2BmygM7CgwHp9CtyR4+6LgHOA7UQumj4MfBFMFU0C3iAyvXOgju7ejsgLxl6PAdcHF4d3FDhH1O0J2nGXuz9RxDkL0mhOiqTQlyPV/wDXEQnZZcDfgNuC+fyqRKY3Zrj728H2FYG3gWOiPYGZ9QIquPv8AuVjiczfF3QckXccM4BRRG4BhUjodwKmRnnKyUBfoCb7pm9K1B53X+3uE4LVQv9+zawykRe/3VG2SUJGoS9HnOA+/NeITOkkEJn/bg3cbmbtgA+B6e5+R7D9iUTm2Pu7+2sFDxU8ijpH
LyK3fv45WE80s7OAn/beelnACiJ3/1QB7gZamdllwARgEJH76TsW2L4SMCe4jfKPwTpBP4zIu5MVpWjPXgU/cFYxOM8XROb0PytmHwk7d9dDjyPqAZxE5I6VJkAO8MegfhWRsOxaxD41i6jNALoXUX8c+C9wWoHaHcDnRG73LLhtNWAmkbuF6gW1a4H/ELkFFCL37r8LVA/W/whUCZbbFtjuWCLvDu4AEmNpzwFtq1JgeR2RzxFUKu//f3oc2Q9dyJUjmpkd7ZFbF/d+UKqCu/9QymPWIDKC3lUWbSytI6098tum0BcRCRHN6YuIhIhCX0QkRI7oT+TWrVvXGzduXN7NEBEplV9+iXwUo3LlynE536JFiza7e3KRT5b3leSDPU477TQXESmtjz76yC+55BI///zz/a233nJ39xYtWnhKSoqnpKR4/fr1oz7Wfffd5/fff3/++r333uvJycnetWtX/+6774rc59Zbb/VNmzb5bbfd5q1atfItW7Z4Xl6ev/rqq6XqV3GALC8mV4/okb6ISGlt376da665hlGjRmFmXH755SxevBgzIycnp0THWr58OcOHD+fOO+8EYPr06UyePJkVK1Ywffp07rnnHkaOHLnfPl999RW1a9embt26ZGVl8Yc//IH333+fihUrcsopp5RZP6OlOX0R+U1bv349Dz74IF26dCE9PZ2GDRvy9ddfc/LJJx965wLcnf79+3PppZfm1yZNmsTAgQNJTEzkyiuvZM6cOYX2e+KJJ7jlllsAMDNq1qxJbm4uK1asoFmzZoW2P9wU+iLym9aiRQsuv/xy8vLyeP3119m1axefffYZH3zwAccddxzNmjVjypQphzzOM888Q2pqKt277/tzT99++y1t2rQBIoFeqVIlfvhh38dIlixZQuPGjUlMTARgz549bNmyhc2bN9O8efMy7ml0FPoiEgqPP/441157Lf3796dGjRo89NBDZGdnM27cOG666SZ+/rm4v3YBGzZs4LHHHiMzM3O/el5eXn6gA1SrVo3t27fnr48YMYK//OUv+eudO3fmlVde4ccff2TmzJmce+655OXllWEvD02hLyKhcNttt/HVV18xbNgwzjrrLK699loAOnToQGpqKp9++mmx+2ZkZDB06FBq1669Xz0pKYlt27blr+/cuZOEhEisLliwgLZt21K1atX85x944AHmzJnDMcccw8KFC2nZsiVLly4ty24eki7kishv2ooVK/jxxx9p06YNDRo04PTTT2fu3Lk0atQo/xbKdevW5Yd1UaZPn86sWbO45ZZb2LlzJ3l5eWzdupW0tDTmzZvH2WefzY8//siqVauoU6cOEJkOGjFiRKFjTZ48mUsvvZSXX3650ItGPCj0ReQ3LTs7m/79+zNv3jx27tzJwoULqV27NtnZ2WRkZPD8889TuXLlg17Y/f777/OXx44dy+rVqxkyZAgrV64kPT2dNm3aMHnyZLp27UqlSpWYOXMm55xzTpH35W/evJm6desCkJubu9/0UDxoekdEftM6derE9ddfz0knnUTHjh3JzMxk+PDhzJgxg5SUFCZOnMiUKVOoUqUKa9euzb8wG41mzZoxcuRI7rrrLrKzs3nqqacAePHFF/OnjwpaunQpZ5xxBgCtWrViwYIFtGrVqmw6GqUj+g+upaWleVZW1qE3FBE5gqxZs4bU1NRyO7+ZLfLIt8cVoukdEfnNaDz4SPou+M9Ltffq4ReWUTv2p+kdEZEQUeiLiISIQl9EJEQU+iIiIaLQFxEJEYW+iEiIKPRFREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiESVeibWYqZzTmg1trM3g2WK5nZVDOba2Z9S1ITEZH4OWTom1kS8AJQvUDNgH8ClYLSzcAidz8b6GNmNUpQExGROIlmpJ8HXAHkFqjdCMwssJ4OTAyW3wfSSlATEZE4OWTou3uuu+d/vbuZ1QGuBR4tsFl1YF2wvAVIKUFtP2bWz8yy
zCxr06ZNJeuNiIgcVCwXcocDd7r7rgK1HcDer3w/OjhutLX9uPtod09z97Tk5OQYmiciIsWJJfQ7Aw+Z2SygnZk9CCwCzgmebwusLkFNRETipMRfl+juJ+xdNrNZ7n6PmaUCb5lZR6AlsIDINE40NRERiZOoR/runl5czd3XAN2AucB57p4Xba3UPRARkaiV2Reju3s2++7MKVFNRETiQ5/IFREJEYW+iEiIKPRFREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiGi0BcRCRGFvohIiCj0RURCRKEvIhIiCn0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIgo9EVEQiSq0DezFDObEyw3MrNZZvaemY22iEpmNtXM5ppZ32C7qGoiIhI/hwx9M0sCXgCqB6X+wJ/dvSvQEDgZuBlY5O5nA33MrEYJaiIiEifRjPTzgCuAXAB3v9vdlwXP1QE2A+nAxKD2PpBWgtp+zKyfmWWZWdamTZtK1hsRETmoQ4a+u+e6+/YD62Z2BfBfd88m8i5gXfDUFiClBLUDzzfa3dPcPS05ObmE3RERkYOJ6UKumTUFBgF/DUo7gKrB8tHBcaOtiYhInJQ4dIM5/vFA3wLvABYB5wTLbYHVJaiJiEicVIxhn8FAI+BJMwO4n8iF3rfMrCPQElhAZBonmpqIiMRJ1CN9d08P/vt3dz/W3dODx2x3XwN0A+YC57l7XrS1su6QiIgUL5aRfpGCC7oTY6mJiEh86EKqiEiIKPRFREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiGi0BcRCRGFvohIiCj0RURCRKEvIhIiCn0RkRBR6IuIhIhCX0QOuz179vDxxx+XdzMEhb5IKAwfPpyaNWtSq1YtHnvssWJrh7Jq1SoaNWrE9u37vjZ76NChNGjQgAYNGvD8888Xud/LL7/M5s2bef3116lfvz7z5s0DYMKECaXsmZRUmf09fRE5Ms2bN4/x48ezePFifvrpJzp06ED79u0L1S6//HIaNmx40GPdcsst3HnnndSsWROAzz//nNmzZ7NmzRqys7M57bTT6NmzJykpKfn77N69m5kzZ/Lss8/Su3dvXnzxRSZOnEjz5s3Zs2fPYe27FKbQF/mNq1atGi+//DJNmzYFoHHjxuzZs6dQLTs7+6ChP2XKFFatWkW/fv3ya8uWLSMtLY0KFSrQsGFDUlNTycnJ2S/0n3vuOfr27QtAbm4ujRo1Ijc3l0mTJnHdddcdji7LQWh6R+Q3rm3btrRq1QqA7Oxs1q5dS1paWqFa69atiz3Grl27uPXWW6lSpQq/+93v8qdlWrZsyZQpU1i/fj1z5sxh8+bNnHTSSfn7/fTTTyxatIizzz4bgFq1arFixQpq1KjBrl27qFq16uHqthQjqtA3sxQzmxMsVzKzqWY218z6lrYmIvFz9913079/f6pXr37Q2oFeffVVNmzYwIABA+jTpw8DBw5kxowZtGzZkoYNG3LhhRfSt29fBg0aROXKlfP3e/rpp/nTn/6Uv963b18GDBhAUlISO3fupEWLFqxevfqw9FWKdsjQN7Mk4AVg72/EzcAidz8b6GNmNUpZE5E4ePvtt/nggw+49957D1oryoIFC7jpppvo27cv1157LQMHDmTq1KmMHTuWJk2a8PHHH/PFF1/wxhtvsHDhQgB27NjBihUrOOWUU/KPc9FFF7Fy5Upq1arF3LlzGTRoEFOnTj08HZYiRTPSzwOuAHKD9XT2fbH5+0BaKWsicpitXbuWP/7xj4wbNy5/RF9UrTiJiYk0atQof/2oo46iVq1azJ8/n5YtWwKQkJBAu3btyMrKAuDJJ59k4MCBhY61cuVKjj/+eLZv306TJk3Ytm1bWXVTonDI0Hf3XHffXqBUHVgXLG8BUkpZ24+Z9TOzLDPL2rRpU8l6IyKF/Pzzz1xyySX87W9/o3379sXWDqZTp0689tpr/PDDD2zZsoWXXnqJzp07
06RJEyZMmMCnn37KrFmzGD9+PK1bt2br1q1s3LiRFi1aFDrW22+/TY8ePahRowbffvstiYmJZd5nKV4sF3J3AHuvvhwdHKM0tf24+2h3T3P3tOTk5BiaJyIFzZgxg08++YThw4dTr1496tWrxyuvvFKo9vrrr7N27VratGlT6BjdunWjd+/etG7dmubNm3PRRRdx/vnnM2DAAOrXr0+nTp3o06cPN954Ix07diQzM5OMjIxCx8nLyyMxMZGEhASuvPJK7rvvPnr27BmPH4MEzN2j29Bslrunm9l9wFJ3f83MXgBGAefFWnP3D4s7Z1pamu99qygivx6rV6+mcePGcT9v48HT437Ow2X18Atj3tfMFrl7kdPnsdyn/wLwlpl1BFoCC4hM2cRaE5EydOQE339LtXdpQk+KF/X0jrunB/9dA3QD5gLnuXteaWpl2hsRETmomD6R6+7Z7LsLp9Q1ERGJD30iV0QkRBT6IiIhotAXEQkRhb6ISIgo9EVEQkShLyISIgp9EZEQUeiLiISIQl9EJEQU+iIiIaLQFxEJEYW+iEiIKPRFREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiFS4tA3syQze8vMssxsVFB71szmmdk9BbaLqiYiIvETy0j/OmCcu6cBNczsDqCCu58JNDWz5mZ2WTS1MuuFiIhEJZbQ/w5obWa1gIZAE/Z90fk7wDlAepQ1kVBZuHBheTdBQi6W0P8ASAVuAZYBlYF1wXNbgBSgepS1QsysXzB1lLVp06YYmidSvHHjxnHjjTfuV9u9ezetW7dm7ty5h9z/mWeeITU1lZSUFIYNG3bIekFz587lk08+4cMPP6R+/fq8/vrrAEyYMKEUPRIpmVhC/37gT+4+FFgOXA1UDZ47Ojjmjihrhbj7aHdPc/e05OTkGJp35NNor3xMmzaNAQMG4O771TMzM2nVqhVnn332Qff/7rvvGDFiBMuWLWP58uW88MILZGVlFVs/0HPPPceNN97IxIkTefHFF3nppZfYtWsXW7duLdN+ihxMLKGfBJxsZhWAM4Dh7JuqaQusBhZFWSsXpR3tXXDBBaSkpFCvXj3q1avH+vXr85+bPXs25513XrH7rlq1iunTp7NixQqaNm3K448/DsAbb7zBzz//HGOPJBrPP/88995773617OxsHn74YR566KFD7v/1119zwgknUK1aNZKSkmjTpg3r1q0rtl7Qv//9b7p27UrFihXJzc2lUaNG5ObmMmXKFC6++OIy7afIwcQS+v8DjAa2A7WB/wWuM7N/ApcD04E3oqzFXWlHewBffvkl69evJycnh5ycHI499lgAPvroI66++mp2795d7L6ZmZlkZGQwbdo0HnnkEd58800AvvnmG6pUqVKKnsmhvPbaa9SpU2e/2uDBg6lSpQo333wzw4YNY9euXcXu36xZMz788EO+/PJLli5dyvz58znrrLOKrRc0YcIErrrqKgBq1arFihUrqFWrFt9++y3169cv+86KFKPEoe/uH7l7K3c/2t27uXsukYu084Eu7r492lpZdaIkSjvaW79+PccccwwJCYV/dP/617944IEHit132bJlHHfccSQlJZGbm0vt2rVxd+bPn88ZZ5xR8s5IiZjZfutr165l3LhxXHfddfTr14/JkycX+t0oqHbt2lxwwQX06tWLK664giuvvJLk5ORi63tNnjyZSy65JP935uqrr2bAgAGcfvrp/Pjjjxx//PGa8pO4KZMPZ7n7Vnef6O45Ja3FW2lHe4sXL2bVqlU0aNCAhg0bMmbMmPznnnvuOZo2bVrsvk8++SQDBw4EIqO9jRs3YmYsWLBAoV8OsrKyaNOmDcOGDePiiy/m0UcfZerUqcVuP3PmTNatW8fSpUtZsmQJmzdv5rXXXiu2DrBnzx6mTp1K796984+TlpbGypUrqVatGhs2bGDw4MG88sorh72/IhDCT+SWdrSXkJDA3XffzTfffMN7773HXXfd
RU5OTpHHLujjjz+mRYsWVK9eHYBevXoxZMgQzj33XL7//ntatGhx0MCRspeYmEijRo3y14866ihq1apV7Pbz58+nRYsW+f+f09LSWLhwYbF1gJdffjl/Wqeg3NxcEhMTyc3NpUmTJmzbtq0suyZSrNCF/oFKOtrr3r07GRkZJCQk0Lx5c7p06cL7779/yPM8/fTT9O/fP389NTWVZcuWccwxx1CxYkUGDRrE2LFjy6JLEqVTTz2VxYsX8/XXX5OXl8fIkSPp3Llzsds3adKEadOm8dFHH7FgwQJGjRpF69ati63v3r2bWbNm0a1bt0LHmjRpEpdddhk1atTg22+/JTEx8XB2VSRfxfJuQHkr6Whv6tSptG/fnpSUyMcM1q1bV+T8fkEffPAB7du3L3Shds+ePfz888/88ssvGu2Vg6SkJEaNGkWvXr1Yv349HTp0YMSIEQCcfvrpjB8/nuOPPz5/+9///vfMnDmT7t27s2fPHvr06cM111yDuxdZHzNmDH379i3y3O5OYmIil112GVdffbXu1Ze4CX3oFxztpaamHnK09+mnn/L222/zyCOP8O6777J8+fKDbg8wduxYnn766UL1GTNm0L17d6ZOnarRXpzccMMN3HDDDfnrPXr0oEePHoW2K+rCaoUKFRg1ahSjRo0q9FxR9QsuuIDGjRsX2Y69twx37ty50O2dIodT6EO/pKO922+/nauuuoqUlBROPPFEJk2axME+RFbw/uwDbdiwgZ49e9KzZ0+6devGI488UvYdDKHGg8vlbuBi/LdUe68efmEZtUMkwg68X/1IkpaW5kV9svHX5JtvvqFBgwaHnAKSsnNkhX7pxBL6v5X+h7nvULoXfDNbFPxRzEJ+syP9I+t//pJS7a3RnoiUFQ0/RURCRKEvIhIiCn0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIgo9EVEQkShLyISIgp9EZEQiTn0zWykmV0cLD9rZvPM7J4Cz0dVExGR+Ikp9M2sI1DP3aea2WVABXc/E2hqZs2jrZVZL0REJColDn0zqwSMAVabWS8gHZgYPP0OcE4JaiIiEkexjPSvB5YCDwPtgQHA3q/+2QKkANWjrBViZv3MLMvMsjZt2hRD80REpDixhP4pwGh3zwFeAt4HqgbPHR0cc0eUtULcfbS7p7l72sG+kUpEREoultBfATQNltOAxuybqmkLrAYWRVkTEZE4iuWbs54FnjOzK4FKRObqp5jZcUAPoAPgwJwoaiIiEkclHum7+/fu/nt37+TuZ7r7GiLBPx/o4u7b3T03mlpZdUJERKJTJt+R6+5b2XdnTolqIiISP/pErohIiCj0RURCRKEvIhIiCn0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIgo9EVEQkShLyISIgp9EZEQUeiLiISIQl9EJEQU+iIiIaLQl8Nux44dLFu2rLybISIo9ENt6NChNGjQgAYNGvD8888Xu92ePXuoWbMm9erVo169epx55pmFtrn//vsZMmRIkfs/9dRTAGRmZtK0aVNWrlwJwIQJE0rfCREpEYV+SH3++efMnj2bNWvWMG/ePP7+97+zYcOGIrf98ssv6dChAzk5OeTk5DBv3rz9nl++fDnDhw8vct+tW7eSk5PDSSedxJtvvskjjzzCtGnTWLJkCcccc0yZ90tEDk6hH1LLli0jLS2NChUq0LBhQ1JTU8nJySly28WLF9OuXbsin3N3+vfvz6WXXlrk85mZmWRkZORvm5SURG5uLjNnzqRLly5l0xkRiVrMoW9mKWa2OFh+1szmmdk9BZ6Pqiblo2XLlkyZMoX169czZ84cNm/ezEknnVTktosXL+aVV16hXr16tG7der+R/jPPPENqairdu3cvtN+GDRvYuXMnTZo0ASAhIYFNmzaRkJBAYmLi4emYiBxUaUb6jwJVzewyoIK7nwk0NbPm0dZK33yJVcuWLWnYsCEXXnghffv2ZdCg
QVSuXLnIbVNSUhgzZgw5OTkMGTKEm266CYiE+mOPPUZmZmaR+2VmZnLrrbfmr19xxRUMGTKEXbt2kZ2dzamnnkpubm7Zd05EihVT6JtZV+AHIAdIByYGT70DnFOCWlHH7mdmWWaWtWnTpliaJ1EYO3YsTZo04eOPP+aLL77gjTfeYOHChUVue/vtt3P++ecD0KdPHzZu3MjGjRvJyMhg6NCh1K5du9A+33zzDRUrVuTYY4/Nr/Xv359ly5ZRp04d/vOf/3DVVVcxe/bsw9NBESlSiUPfzCoD9wKDg1J1YF2wvAVIKUGtEHcf7e5p7p6WnJxc0uZJlObPn0/Lli2ByLRLu3btyMrKKnLbMWPG5C//9NNPbNmyhYSEBKZPn84tt9xCvXr1yMjI4NFHH82fv8/MzOSvf/1roWPNnDmT9PR03J06deqwbdu2w9A7ESlOxRj2GQyMdPdtZgawA6gaPHc0kReSaGtSTpo0acKECRNIT09n69atjB8/nvHjxxe57YQJE6hWrRq9evXiH//4B2eccQZ169bl+++/z99m7NixrF69miFDhvDVV19Rp04d6tSpU+hYn332GRkZGZgZ27dvp2nTpoetjyJSWCzBex4wwMxmAe2Ai9k3VdMWWA0sirIm5WTAgAHUr1+fTp060adPH2688UYaN25MmzZtCm07YsQIHn30UerXr8+SJUuKfXHY64knnuDmm28uVN+6dSupqakAdO/endGjR9OxY8ey6ZCIRKXEI31377R3OQj+S4A5ZnYc0APoAHiUNSknNWrU4NVXXy1U/+yzzwrVTjzxRBYvXnzQ491www35y7fffnuRd+ckJSXRu3dvAO644w7uuOOOErZaREorlumdfO6eDmBm6UA34GF3316SmhwejQdPL+cW/LdMjrJ6+IVlchwRiShV6O/l7lvZd2dOiWoiIhI/upgqIhIiCn0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIgo9EVEQkShLyISIgp9EZEQUeiLiISIQl9EJEQU+iIiIaLQFxEJEYW+iEiIKPRFREJEoS8iEiIKfRGREClx6JtZTTN728zeMbPJZlbZzJ41s3lmdk+B7aKqiYhI/MQy0r8G+Ke7nw/kAFcCFRfi580AAAQnSURBVNz9TKCpmTU3s8uiqZVVJ0REJDol/mJ0dx9ZYDUZuBbIDNbfAc4BTmHfF6AfrPZVyZssIiKxinlO38zOBJKAtcC6oLwFSAGqR1kr6rj9zCzLzLI2bdoUa/NERKQIMYW+mdUGngT6AjuAqsFTRwfHjLZWiLuPdvc0d09LTk6OpXkiIlKMWC7kVgZeBe509zXAIiJTNQBtgdUlqImISByVeE4f+ANwKnC3md0NPA9cZ2bHAT2ADoADc6KoiYhIHJV4pO/u/3L3JHdPDx4vAOnAfKCLu29399xoamXVCRERiU4sI/1C3H0r++7MKVFNRETiR5/IFREJEYW+iEiIKPRFREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiGi0BcRCRGFvohIiCj0RURCRKEvIhIiCn0RkRBR6IuIhIhCX0QkRBT6IiIhotAXEQkRhb6ISIjEPfTN7Fkzm2dm98T73CIiYRfX0Dezy4AK7n4m0NTMmsfz/CIiYRfvkX46+74Y/R3gnDifX0Qk1Mzd43cys2eBJ9z9UzM7HzjV3YcfsE0/oF+weiLwRdwaGJu6wObybkQ5Ud/DK8z9/zX0PdXdk4t6omKcG7IDqBosH00R7zTcfTQwOp6NKg0zy3L3tPJuR3lQ38PZdwh3/3/tfY/39M4i9k3ptAVWx/n8IiKhFu+R/hvAHDM7DugBdIjz+UVEQi2uI313zyVyMXc+0MXdt8fz/IfJr2Yq6jBQ38MrzP3/Vfc9rhdyRUSkfOkTuSIlZGa1zaybmdUt77aIlJRCvxTMLMXM5pR3O+LNzGqa2dtm9o6ZTTazyuXdpngxsyRgGtAemGlmRd4W91sX
/O4vLu92xJOZVTSzb8xsVvA4ubzbFAuFfoyCf/wvANXLuy3l4Brgn+5+PpADdC/n9sRTG+A2d/8H8G/g1HJuT3l5lH23X4dFG2C8u6cHjyXl3aBYKPRjlwdcAeSWd0Pizd1Huvu7wWoysLE82xNP7j7b3eebWScio/155d2meDOzrsAPRF7ww6QDcJGZfRT8DbF43/1YJhT6MXL33N/I3UcxM7MzgSR3n1/ebYknMzMiL/hbgV3l3Jy4Cqby7gUGl3dbysFC4Dx3bw9UAnqWc3tiotCXmJhZbeBJoG95tyXePGIA8BlwSXm3J84GAyPdfVt5N6QcfObu64PlLOBX+QcjFfpSYsFo71XgTndfU97tiScz+7uZXR+s1gLCFn7nAQPMbBbQzsyeKef2xNOLZtbWzCoAvYFPy7tBsdB9+qVkZrPcPb282xFPZvZnYBj7fun/5e4TyrFJcRNcwJ8IVAE+BwZ4SP8Rhe1338xaAy8DBkxx97vLuUkxUeiLiISIpndEREJEoS8iEiIKfRGREFHoi4iEiEJfRCREFPoiIiGi0BcRCZH/D1RO84bbSt7yAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Star-rating distribution of comments whose comment_length equals 3.\n",
    "# NOTE(review): this cell was labelled \"four-character comments\" while the\n",
    "# filter is comment_length == 3 -- confirm which length is intended.\n",
    "x = np.arange(1, 6)\n",
    "# Select the star column of the matching rows once, then count per star value.\n",
    "# Chained boolean indexing (frame[mask_a][mask_b]) is avoided because it makes\n",
    "# pandas warn that the boolean Series key will be reindexed.\n",
    "length3_stars = dataset.loc[dataset['comment_length'] == 3, 'star']\n",
    "nums = [int((length3_stars == i).sum()) for i in x]\n",
    "plot_score_distribution(x, nums)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:34:20.625730Z",
     "start_time": "2020-05-14T01:34:20.616568Z"
    }
   },
   "source": [
    "> 5% 的影评字数少于 3 个，删除这些数据；删除后对类别分布无影响\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only comments whose comment_length is greater than 3, i.e. drop every\n",
    "# comment of length 3 or less.\n",
    "# NOTE(review): the accompanying notes speak of comments \"shorter than 3\";\n",
    "# confirm whether length-3 comments should be kept (>= 3) or dropped (> 3).\n",
    "# .copy() makes the result an independent frame, so adding columns to it later\n",
    "# does not raise SettingWithCopyWarning.\n",
    "dataset = dataset[dataset['comment_length'] > 3].copy()\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is not\n",
    "# wrapped in print() (that would print an extra \"None\" line).\n",
    "dataset.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 影评的词数分布\n",
    "> 已经分词处理过文本，基本单位变成单词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:40:30.846545Z",
     "start_time": "2020-05-14T01:40:28.869640Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "min num of words:0, max num of words:1941, median num of words:12.0,\n",
      "mean num of words:19.212695313318388, mode num of words:4\n"
     ]
    }
   ],
   "source": [
    "def get_words_num(s):\n",
    "    \"\"\"Return the number of whitespace-separated tokens in the string `s`.\"\"\"\n",
    "    # str.split() without a separator already ignores leading and trailing\n",
    "    # whitespace, so the text can be split directly.\n",
    "    return len(s.split())\n",
    "\n",
    "\n",
    "# Token count of every cleaned (already word-segmented) comment.\n",
    "dataset['words_num'] = (\n",
    "    dataset['cleaned_comment'].apply(get_words_num).astype('int')\n",
    ")\n",
    "\n",
    "# Summary statistics of the per-comment token counts.\n",
    "min_, max_ = dataset['words_num'].min(), dataset['words_num'].max()\n",
    "median, mean = dataset['words_num'].median(), dataset['words_num'].mean()\n",
    "# mode() returns a Series of modes; [0] takes its first entry.\n",
    "mode = dataset['words_num'].mode()[0]\n",
    "print(\n",
    "    f\"min num of words:{min_}, max num of words:{max_}, median num of words:{median},\\nmean num of words:{mean}, mode num of words:{mode}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:41:30.008937Z",
     "start_time": "2020-05-14T01:41:29.995914Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "      <th>cleaned_comment</th>\n",
       "      <th>comment_length</th>\n",
       "      <th>words_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>……</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2242</th>\n",
       "      <td>= =</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2646</th>\n",
       "      <td>。。。</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3678</th>\n",
       "      <td>。。。。。。</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3757</th>\n",
       "      <td>？？</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2376705</th>\n",
       "      <td>……</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2378886</th>\n",
       "      <td>。。。。</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2381227</th>\n",
       "      <td>！</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2381900</th>\n",
       "      <td>★★★★★★★★★★★★★★</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2382251</th>\n",
       "      <td>。。。</td>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3660 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 comment  star cleaned_comment  comment_length  words_num\n",
       "681                   ……     1                               2          0\n",
       "2242                 = =     5                               3          0\n",
       "2646                 。。。     5                               3          0\n",
       "3678              。。。。。。     2                               6          0\n",
       "3757                  ？？     1                               2          0\n",
       "...                  ...   ...             ...             ...        ...\n",
       "2376705               ……     3                               3          0\n",
       "2378886             。。。。     5                               5          0\n",
       "2381227                ！     5                               2          0\n",
       "2381900   ★★★★★★★★★★★★★★     5                              15          0\n",
       "2382251              。。。     5                               4          0\n",
       "\n",
       "[3660 rows x 5 columns]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the comments whose cleaned text contains no tokens at all.\n",
    "dataset.loc[dataset['words_num'] == 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:48:40.998658Z",
     "start_time": "2020-05-14T01:48:40.906974Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yangbin7/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXMAAAEGCAYAAACXVXXgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de3hU1b3/8fc3XCWAhACJgShQL4AGBIMVSyXYUkstxfJT0aNF5fgAFiziDTxqtYLWy7GNQhFRG3o4yg+KAorSqjVQjtwCSIOaUohyv5hwSQgHMML3/DHDkJArECay+byeZx5mf/dea9bKTD7ZWbMnmLsjIiKnt5jaHoCIiJw8hbmISAAozEVEAkBhLiISAApzEZEAUJiLHAczs3JqdWtjLCIlKczlW8XMWphZTPh+WzO7MHy/7rGhaSENKuinQ/jf9mZ2dQXHjDCzhmb2kZmlmNkDZtbUzF42s6sqGOIVZvbBMbUFZtatinlVOZ5jjo8xswVmdl5Vx4qAwlyizMx+b2afmdn88G2Vmb0S3hcHzAduDh/eHviLmZ0L/Duw3MxWmNluM1sDrABWmFmTYx7jWmBm+Czagclm1qic4dQFHgG+AWKBm9y9ELga2FzBFFKAj0s8VjOgBfBJJXOu7nhK6gs0cvcNVRwnAoRezCLRdJjQScSR114d4GszOx+YCIx0978BuPtHZvYo0B14BZju7nvMbDaQDiwAUtx975HOw2f1jwEPe+gTcV+a2Xvh44eUOK4hsJVQuMYBacDfw2fkjd39i/BxDdz9YPj+O+Gx7Dez64BxQFOgGbA2vAKT6O6NT2A8PYBpQBGhHy4XAtvMbFWJr52Fv27D3X1+9b/kciYwfQJUosnMegIXAKnhUhawCagPrAT2AI8CvwG+CQcg4fAc5O4DSoR5MfD/gQvdfX/4uAeBPu7ep8RjNgaWAH8F7nd3N7OmwFPA5YQCeiqwC0ggdGa+CTgX2Atc6u57zexz4MrwD5RxQC5wOzDE3deEHyvX3b9T4rGrNZ5jvkY3A4PdvY+ZvQr8yt3/9wS+3HIG0TKLRI2Z/RL4HXAb8AOgD6EwfBLo6u47gN8DOwiF+T/NrDgcfnOAzkfWnsOGA0+VCPIfAPdQ4owXwN2LCAX0DwmdfV8E7Ae+ALYACwn9ENkNfAeY4O6pwEeEll6OnPkfPmZKvULdh4L82GOOczxH2rQFxpZo05vQbw8ildIyi0SNu080sxzgsmN2ferufzGzu4H67j4+XP8PM1sPHCR04nEbkFei3VzgLTOrAzQCJhM6q19sZmcRWpYoDB/bnFCwdgcOAW2AfOBB4AlCSyHXAfVKjO9cQmffFVkGvGpmr7n7v4drhwHC6/jHMx7M7BzgXeA8YFZ42aYNsMTMPDyeX7j7u5WMSc5QWmaRqDKz54EGwPJwqQPQhNASR2fgekJB91V4OWQ9cD7wHtCSUPB9B/iK0BJIfeDP7j7WzOq7+9fhx3kW2OLuL4S3/0LoLP7v4e3LgFcJrXfHEFpWKQZ+QuhN2B8Df3f3lBJj/5TQWfIhIBEY4+5TzGwJ8Ji7/9XM1rj7ReHjj2c8XYC3gBeAB929Tbi+DrjE3Q+Y2RTgdXc/9moaES2zSNQdIrR0MAS4i9BVGwZMAP4foXD+gFCYHhEPjHP3ruHljwXAUHdPdffO7j4W4EhwhvUClpbYbk2JK1TcfQXQEygg9Gbjs8Ca8JLNW8BsQsssx/q+u19K6AfBEc8Dg8JvqhaVeIxqjyc8jv9w9xfLecySdPYl5VKYS7T9FvgFofDMAR4A7g2vl59FaJnhL+4+L3x8XWAe0Kq6D2Bm/YE67r6kRPkcQuvjJSUR+g3hL8DLhC51hFCYXwW8U82HnAUMBs7m6DLKcY3H3de7+/TwZpnvSzOrT+iH2jfVHJOcYRTmEjXh68hnElpaiSG0vnwJcJ+ZXQosAt519wfDx19E
aA17qLvPLNlV+FbeY/QndInjXeHtpmZ2JXDgyCWGJawjdDVMA+Bh4GIzGwBMB+4ndD3490scXw9YGL5c8M7wNuF5GKHfJtadxHiOKPlBqLrhx1lDaM08u4I2cqZzd910i8oN6EjoCo52wHbgznD9ZkIheHU5bc4up/YX4Mfl1F8APgMuK1F7EPiU0GWNJY9tBGQSunomMVy7FfiQ0KWOELr2/AMgNrx9J9AgfL9LiePOIXQ2/yDQ9ETGc8zYGpS4v4XQdfD1avv50+3bfdMboFIrzKyxhy7RO/IBnjruvu8k+2xC6Iy3uCbGeLK+beORYFOYi4gEgNbMRUQCQGEuIhIAtfIJ0BYtWnjbtm1r46FFRE5bK1asyHf3luXtq5Uwb9u2LcuXL6/6QJHTQFFREZs2baJjx461PRQJODOr8E8ia5lFTntPP/00Z599Ns2aNeP5558vtS8jI4Pbb7+9yj7++te/0rdvX/r168eyZcsi9bvvvpvY2FgSEhJ44403ym07YcIEANLT02nfvj25uaE/5zJ9+vRyjxc5FRTmclpbvHgx06ZN45NPPmHRokX85je/YdOmTQDk5eXxwAMPVNnHF198wciRIxk3bhx33HEHAwYMwN2ZNm0aOTk55ObmMmPGDIYNG8bBg6U/57N79262b99Ox44dmTNnDs899xxz585l9erVtGpV7Q+tipw0hbmc1ho1asQbb7xB+/bt6dSpE23btmXr1q0AjBo1ij59+lTRA+zYsYPx48dz2WWXMWDAAA4cOEBRURHJyclkZGSQmJhIr169MDP27NlTqm16ejojR44EQh/Ai4uLo7CwkMzMTHr37l3zExapgMK8hhQVFZGTk1PbwzjjdOnShYsvvhiArVu3smnTJi655BI++OADNm/ezJAhQ6roAXr06EGfPn0oLi6OhHqTJk3o2bMnycnJACxbtozmzZuTkJAQabdjxw72799Pu3btAIiJiSEvL4+YmBiaNm16CmYrUrHAhHl566ZZWVn079+fa665hnnz5lXavqCggBtvvJGkpCSuuOIKPvvss8i+itZTS9K6ae17+OGHGTp0KDExMdx777288sorhP8meLWMHj2a++67j7vvvrvMvjFjxjB69OhStfT0dEaNGhXZHjhwII8//jjFxcVs3bqVbt26UVhY5u9uiZwatfE3BC677DKvSYsWLfLOnTt7bm6uf/bZZ96kSRNfv369X3DBBf7RRx95Zmamt2zZ0nft2lVhH/fdd58/9NBDfvjwYZ89e7ZfeOGF7u6em5vrF110kS9fvtzffPNNb926tR8+fLhU2127dvnIkSPd3T0tLc1nzpzp6enpnp2d7R999FGNzlXK99577/n555/vRUVFPnr0aH/mmWfc3T0zM9Nvu+22aveTnZ3tLVu29MLCwkht4sSJ/r3vfc8PHToUqW3YsMEfeeSRcvt48cUXvXfv3v7ss8/622+/fWITEikHsNwryNVAhPmqVav8008/jWynpKT4/Pnzffr06ZFat27dPDs7u8I+2rRp45s3b45sN27c2PPy8nzRokX+/vvvR+rx8fGlvtHd3X/961/7F1984e7uvXr18r/97W/+xBNP+AsvvHDSc5Oqbdy40ZOSknzp0qXu7t6pUydv1aqVJyQkeFxcnDds2NB//vOfV9j+k08+8XXr1kW2u3TpEnk9ffLJJ56YmBh5fo8YNWqU5+fnl+nro48+8uzsbE9LS/PXXnvN/+u//qsmpiji7pWHeSD+27guXbpE7h9ZN01NTSU2NpZDhw4xe/ZsiouL6dSpU6X95Ofn07p1a9avX09xcTFNmjShR48eABQXFzNp0qTIeuoRWjetXQcPHuRnP/sZDzzwAJdffjlAqSWy+fPnM2XKFKZMmVJhH9nZ2WRkZDBv3jxyc3PZvn07F1xwATt37qRfv35MnDgx8vwCrF27lvj4eOLj48vta+TIkZgZBQUFtG/fvuYmK1KZilL+VN5q+sy8pNtv
v91Hjx4d2X7++ee9YcOGPmHChErbjR492jt16uRPP/20d+zY0W+44YZS+0eNGuX16tXzd955p1R9zJgxvnXr1sj2pEmTvEOHDv7YY4/5k08+6V27dvWCgoIamJmUZ/bs2Q54QkJC5DZz5szI/pLLLBs3bvSUlJQyfRw+fNhHjBjhrVq18osvvtgzMzPd3T09Pd1jYmJK9b106VIfMWJEuc/prl27fNasWe7u/swzz3iHDh18586dNT9pOWNRyZl5rfzVxNTUVD8VnwCdN28ev/rVr1i1ahWxsbGR+ubNm/nud7/LggULOP/888tte/jwYaZOncrSpUt56aWXyMrKIjU1tdQxq1ev5gc/+AG5ubk0adKEjRs38sorrzB27Ngy/Y0fP55Zs2bRt29fOnToQL9+/Wp2smegtmO+Hf+P8TcFO6h7dkLVB1Zi/dPX1tBo5ExiZis89F8nlhGYq1k2bdrEnXfeyeuvv05sbCzr1q0jOzv0n7K0adOG7t27s2bNmgrbx8TEcNttt9GxY0f69+8fCfJVq1ZFrkxJSUkhKSmJjRs3AqGrGe65554yfWVmZpKWloa7Ex8fX+baZDm9nWyQi5wKgQjz8tZNt27dysCBA9mzZw/btm0jKyuLrl27VtrP119/TXp6OuPGjYvUsrOzufPOOzlw4ACfffZZZD21qnXTlJSUyLqp1s5F5FQ77ZZZyvtV+3/XLiHvrXHExDaL1Jr3uYtvdm1h74p3sHoNadbrNmI79OSbwjy+mvkbkgZPKNPP3pXvcnDbv2hx7dFrh92d3R++zL5//g91GjWleZ9hNDy3M7s+mESzqwYR06BRqT4OHSji4MbVNLqwBwVLZ1KU/SGJtz5HnbOaHPtw+lX7OH1blllqgp57ORGVLbMEIsxrg9ZNo+/b8tzXBD33ciLOiDXzaNO6qYh8myjMRUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkABTmIiIBoDAXEQkAhbmISAAozEVEAkBhLiISAApzEZEAUJiLiARApWFuZnFm9p6ZLTezl8O118xssZk9UuK4MjUREYmeqs7MfwG8Hv6Ti03M7EGgjrv3ANqb2QVmNuDY2ikes4iIHKOqMN8JXGJmzYBkoB0wI7zvfaAnkFZOTUREoqiqMP8f4DzgV0AOUB/YEt63C0gAYsuplWFmQ8LLNcvz8vJOdtwiIlJCVWH+GDDM3Z8A/gn8G3BWeF/jcPuicmpluPtkd09199SWLVue9MBFROSoqsI8DkgxszrAd4GnObqM0gVYD6wopyYiIlFUt4r9vwUyCC21LAZ+Dyw0sySgL3AF4OXUREQkiio9M3f3Ze5+sbs3dvc+7l5I6A3PJUBvdy8or3aqBy0iIqVVdWZehrvv5ujVKxXWREQkevQJUBGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCQGEuIhIACnMRkQBQmIuInKDDhw+zcuXK2h4GoDCXGlBUVEROTk5tD0POUK+//jp33HFHZPvVV1/lvPPOIyEhgaeeeqrStl9//TVDhw6lTZs2pKSksHDhwsi+rKws+vfvzzXXXMO8efPKbf/GG2+Qn5/Pm2++SevWrVm8eDEA06dPr4GZHR+FeUCczAsa4IknnqBNmza0adOGjIyMSH3SpEkkJiaSmprKl19+WW7bCRMmAJCenk779u3Jzc0FaucFLWeWuXPnMnz4cNwdgJ07d/KHP/yBnJwc/vnPf/KnP/2J5cuXV9h+8uTJ7Nu3j/Xr15ORkcGNN97I3r17KSgo4JZbbuGee+7hoYce4rbbbmP37t2l2n7zzTdkZmbyox/9iKlTpzJ16lRmzJhBfn4+hw8fPqXzLo/CPABO9gX96aefsmDBAjZs2MDixYsZPXo0O3bsYPXq1YwdO5aVK1cyfvx4RowYUabt7t272b59Ox07dmTOnDk899xzzJ07l9WrV9OqVatTNmcRgIyMDB599NHI9hdffMGFF15Io0aN
iIuLo3PnzmzZsqXC9tOnT+euu+6ibt26pKam0qJFC9asWcO2bdsYN24cvXv3Ji0tjeTkZDZv3lyq7R//+EcGDx4MQGFhIeeeey6FhYW89dZbXHfddadmwpVQmAfAyb6gc3JySE1NpU6dOiQnJ3Peeeexfft2Zs+ezaBBg0hKSqJHjx7k5+ezb9++Um3T09MZOXIkAO5OXFwchYWFZGZm0rt371MzYZGwmTNnEh8fH9n+zne+w6JFi/jXv/7F559/zpIlS7jyyisr7SM/Px+AvXv3smHDBlq0aEGHDh248cYbOXToEG+++SbFxcV06tQp0ubAgQOsWLGC733vewA0a9aMdevW0aRJE4qLiznrrLPKfaxTSWEeACf7gu7UqRNvv/0227ZtY+HCheTn59OxY0c2b95M586dI8clJSWxYcOGyPaOHTvYv38/7dq1AyAmJoa8vDxiYmJo2rTpKZipSGlmVmq7efPmXHPNNfTv35+BAwdy0003Udl/U3nLLbcwYsQInnnmGfr06UOHDh1o27ZtZP8LL7zArbfeytChQ6lTp06kPmnSJIYNGxbZHjx4MMOHDycuLo79+/fToUMH1q9fX2PzrA6FeQCc7Au6U6dOJCcnc+211zJ48GDuv/9+6tevz6FDh0qFcmxsLHv27Ilsp6enM2rUqMj2wIEDefzxxykuLmbr1q1069aNwsLCGpypSOUyMzPZsmULn3/+OatXryY/P5+ZM2dWePywYcN48cUXycvLY+XKlYwePbrU/nvvvZe1a9fy1FNPsW7dOiD0hv+6devo2rVr5Lif/vSn5Obm0qxZMz7++GPuv/9+3nnnnVMzyQoozAPoeF/QU6ZMoV27dqxcuZI1a9Ywe/ZssrKyiIuLKxXe+/fvJyYm9JLZuHEjdevW5ZxzzonsHzp0KDk5OcTHx/Phhx9y8803s2DBglM3UZFjLFmyhA4dOkROcFJTU8nKyqq0zc9//nN69erFxRdfzIABAwBYt24d2dnZALRp04bu3buzZs0agArfP8rNzeX888+noKCAdu3alfreiQaFeQAd7wt6yZIlkfXAmJgYLr30UpYvX05qamrkUit3Z+XKlbRu3RoInZXfc889ZfrKzMwkLS0Ndyc+Pj7qL2g5s7Vr1465c+eybNkyli5dyssvv8wll1xSZbvf/va3PPXUU5Hvma1btzJw4ED27NnDtm3byMrKomvXruzevZuvvvqKDh06lOlj3rx59O3blyZNmrB58+aoLzUqzAPoeF/Q7dq1Y/r06fzjH/9g/vz5TJs2jUsuuYS+ffsya9Ys3nrrLcaOHUt8fDzJycmsXbuW+Pj4Uuv0R2RnZ5OSkoKZUVBQoLVziaobbriBq6++mh//+Mdcc801XH755dxyyy0AJCYmcvDgwTJt3nvvPerXr0/fvn0jtauuuopBgwbRsWNHvv/975Oenk5SUlKpN/xLOrIkGRMTw0033cSvf/1rfvKTn5y6iZbDjlzOFk2pqale2aVylWk75t0aHk3tWf/0tTXW15QpU5g/fz5Tpkzh0KFD/PKXv+TPf/4zhw8f5vrrr2fy5MnExMSQmJjIhg0baNCgQaTt3r17GTx4MO+//z716tXjrrvuYuzYsQAsXryYBx54gAYNGjBx4kQuuugi7r77bp588skyQb17924WLFjAddddx7PPPktGRgYff/wxzZs3r5E56rk/c31bnvtvCnZQ9+yEk+rjZJ57M1vh7qnl7lOY154TeVK/DfOviRc0HP/8vw1zrykK8+Oj5z6ksjDXMosct5oIchGpWQpzEZEAUJiLiASAwlxEJAAU5iIiAaAwFxEJAIW5iEgAKMxFRAJAYS4iEgAKcxGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCoNphbmYTzaxf+P5rZrbYzB4psb9MTUREoqNaYW5m3wcS3f0dMxsA1HH3HkB7M7ugvNopHLOIiByjyjA3s3rAK8B6M+sPpAEz
wrvfB3pWUBMRkSipzpn5IOBz4FngcmA4sCW8bxeQAMSWUyvFzIaY2XIzW56Xl3ey4xYRkRKqE+Zdgcnuvh34b+DvwFnhfY3DfRSVUyvF3Se7e6q7p7Zs2fKkBy4iIkdVJ8zXAe3D91OBthxdRukCrAdWlFMTEZEoqVuNY14D/mhmNwH1CK2Pv21mSUBf4ArAgYXH1EREJEqqDHN33wvcULJmZmlAH+BZdy+oqCYiItFRnTPzMtx9N0evXqmwJiIi0aFPgIqIBIDCXEQkABTmIiIBoDAXEQkAhbmISAAozEXkhBUVFZGTk1PbwxAU5iKnvddff5077rijTD0zM5PevXtX2X7SpEkkJiaSmprKl19+GanffffdxMbGkpCQwBtvvFFu2wkTJgCQnp5O+/btyc3NBWD69OknMhU5CQpzkdPY3LlzGT58OO5eqn7w4EGGDh1apn6s1atXM3bsWFauXMn48eMZMWIEANOmTSMnJ4fc3FxmzJjBsGHDOHjwYKm2u3fvZvv27XTs2JE5c+bw3HPPMXfuXFavXk2rVq1qdqJSJYW5yGksIyODRx99tEx97NixXHbZZVW2nz17NoMGDSIpKYkePXqQn5/Pvn37SE5OJiMjg8TERHr16oWZsWfPnlJt09PTGTlyJADuTlxcHIWFhdX+jUBqlsJc5DQ2c+ZM4uPjS9U+//xz5syZw2OPPVZl+82bN9O5c+fIdlJSEhs2bKBnz54kJycDsGzZMpo3b05CwtG/bL1jxw72799Pu3btAIiJiSEvL4+YmBiaNm1aE1OT46QwFzmNmVmpbXdn6NCh/OEPf6Bhw4ZVtj906FCp8I2NjS1zBj5mzBhGjx5dqpaens6oUaMi2wMHDuTxxx+nuLiYrVu30q1bNwoLC09kSnKCFOYiAfLyyy+TkpLCVVddVa3j4+LiSoX3/v37iYk5GgsvvfQSX3/9NUOGDInUNm7cSN26dTnnnHMitaFDh5KTk0N8fDwffvghN998MwsWLKiBGUl1KcxFAuTtt99m5syZJCYm0r17dxYtWkT37t0rPD41NZXFixcDobP6lStX0rp1awBWrVrFE088wdSpU0sFfHp6Ovfcc0+ZvjIzM0lLS8PdiY+PL3OGL6eWwlwkQN577z2++uortm/fTlZWFldeeSVZWVkVHt+3b19mzZrFW2+9xdixY4mPjyc5OZmdO3fSr18/Jk6cGFkXB1i7di3x8fFl1ukBsrOzSUlJwcwoKCjQ2nmUndCfwBWR6Gs75t1y60Wr/8GBjZuZf8z+bwp2kP/Fzki7TRNupc2wDKxuvVLHHb56FP/2q0exunVp3ueXtB3zLoXL57B7y1au/8VgYDAArQY8yr7PMml21SBeOeaxDh0o4uDGPH6/7V0K/DwWjvsdibc+x8jFZce8/ulrT/RLIJVQmIuc5hqn/JDGKT8sU697dgKJ//Z0ZDt5xH+X275B644k3vpsqVrT1P40Te1f5tg6sc2IadCobL1hYxpd2AOAs797PWd/9/rjmoOcPC2ziEi11T07oeqDpFYozEVEAkBhLiISAApzEZEAUJiLiASAwlxEJAAU5iIiAaAwFxEJAIW5iEgAKMxFRAJAYS4iEgAKcxGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkAKoV5maWYGafhO+/ZmaLzeyREvvL1EREJHqqe2b+n8BZZjYAqOPuPYD2ZnZBebVTNVgRESlflWFuZlcD+4DtQBowI7zrfaBnBTUREYmiSsPczOoDjwJjwqVYYEv4/i4goYJaeX0NMbPlZrY8Ly/vZMctIiIlVHVmPgaY6O57wttFwFnh+43D7curleHuk9091d1TW7ZseXKjFhGRUqoK8x8Cw81sPnAp0I+jyyhdgPXAinJqIiISRXUr2+nuVx25Hw70nwELzSwJ6AtcAXg5NRERiaJqX2fu7mnuXkjoDc8lQG93Lyiv
dioGKiIiFav0zLw87r6bo1evVFgTEZHo0SdARUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkABTmIiIBoDAXEQkAhbmISAAozEVEAkBhLiISAApzEZEAUJiLiASAwlxEJAAU5iIiAaAwFxEJAIW5iEgAKMxFRAJAYS4iEgAKcxGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkABTmIiIBoDAXEQkAhbmISAAozEVEAkBhLiISAApzEZEAUJiLiASAwlxEJAAU5iIiAVBlmJvZ2WY2z8zeN7NZZlbfzF4zs8Vm9kiJ48rUREQkOqpzZn4L8Dt3/xGwHbgJqOPuPYD2ZnaBmQ04tnbqhiwiIseqW9UB7j6xxGZL4FYgPbz9PtAT6ArMOKa2tmQ/ZjYEGAJw7rnnntSgRUSktGqvmZtZDyAO2ARsCZd3AQlAbDm1Utx9srununtqy5YtT2rQIiJSWrXC3MyaA+OBwUARcFZ4V+NwH+XVREQkSqrzBmh94M/AQ+6+AVhBaBkFoAuwvoKaiIhESZVr5sC/A92Ah83sYSAD+IWZJQF9gSsABxYeUxMRkSipzhugLwEvlayZ2dtAH+BZdy8I19KOrYmISHRU58y8DHffzdGrVyqsiYhIdOiNShGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkABTmIiIBoDAXEQkAhbmISAAozEVEAkBhLiISAApzEZEAUJiLiASAwlxEJAAU5iIiAaAwFxEJAIW5iEgAKMxFRAJAYS4iEgAKcxGRAFCYi4gEgMJcRCQAFOYiIgGgMBcRCQCFuYhIACjMRUQCQGEuIhIACnMRkQBQmIuIBIDCXEQkABTmIiIBoDAXEQmAGg1zM3vNzBab2SM12a+IiFSuxsLczAYAddy9B9DezC6oqb5FRKRyNXlmngbMCN9/H+hZg32LiEglzN1rpiOz14AX3f0fZvYjoJu7P11i/xBgSHjzImBNjTzwqdMCyK/tQdSSM3nucGbPX3P/djvP3VuWt6NuDT5IEXBW+H5jjjnrd/fJwOQafLxTysyWu3tqbY+jNuEoOJEAAAJMSURBVJzJc4cze/6a++k795pcZlnB0aWVLsD6GuxbREQqUZNn5rOBhWaWBPQFrqjBvkVEpBI1dmbu7oWE3gRdAvR294Ka6ruWnDZLQqfAmTx3OLPnr7mfpmrsDVAREak9+gSoSJiZNTezPmbWorbHInK8FOblMLMEM1tY2+OINjM728zmmdn7ZjbLzOrX9piixczigLnA5UCmmZV7+VfQhV/7n9T2OKLJzOqa2UYzmx++pdT2mE6EwvwY4W/qPwGxtT2WWnAL8Dt3/xGwHfhxLY8nmjoD97r7k8BfgW61PJ7a8p8cvcT4TNEZmObuaeHb6toe0IlQmJd1CBgIFNb2QKLN3Se6+wfhzZbAV7U5nmhy9wXuvsTMriJ0dr64tscUbWZ2NbCP0A/yM8kVwE/NbFn470vV5FV+UaMwP4a7FwbgSpyTYmY9gDh3X1LbY4kmMzNCP8h3A8W1PJyoCi+pPQqMqe2x1IIs4IfufjlQD/hJLY/nhCjMpRQzaw6MBwbX9liizUOGA9nAz2p7PFE2Bpjo7ntqeyC1INvdt4XvLwdOyz8SqDCXiPDZ2Z+Bh9x9Q22PJ5rMbLSZDQpvNgPOtFD7ITDczOYDl5rZq7U8nmiaamZdzKwOcB3wj9oe0InQdeYVMLP57p5W2+OIJjO7C3iKoy/ml9x9ei0OKWrCb3zPABoAnwLD/Qz95jjTXvtmdgnwBmDA2+7+cC0P6YQozEVEAkDLLCIiAaAwFxEJAIW5iEgAKMxFRAJAYS4iEgAKcxGRAFCYi4gEwP8Bh1yRbq/i0vAAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Star-rating distribution of the comments that contain no tokens.\n",
    "x = np.arange(1, 6)\n",
    "zero_words = dataset[dataset['words_num'] == 0]\n",
    "# zero_words is already restricted to words_num == 0, so only the star\n",
    "# condition is applied here. Filtering it again with a mask built from\n",
    "# `dataset` (a differently indexed frame) made pandas warn \"Boolean Series key\n",
    "# will be reindexed to match DataFrame index\".\n",
    "nums = [int((zero_words['star'] == i).sum()) for i in x]\n",
    "plot_score_distribution(x, nums, gap=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **这一部分纯标点符号的评论怎么处理?**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:49:58.913705Z",
     "start_time": "2020-05-14T01:49:58.827072Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 2382890 entries, 0 to 2386552\n",
      "Data columns (total 5 columns):\n",
      " #   Column           Dtype \n",
      "---  ------           ----- \n",
      " 0   comment          object\n",
      " 1   star             int64 \n",
      " 2   cleaned_comment  object\n",
      " 3   comment_length   int64 \n",
      " 4   words_num        int64 \n",
      "dtypes: int64(3), object(2)\n",
      "memory usage: 109.1+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Drop the comments left without any token (words_num == 0).\n",
    "dataset = dataset[dataset['words_num'] > 0]\n",
    "# DataFrame.info() prints the summary itself and returns None; wrapping it in\n",
    "# print() only appended a stray \"None\" line to the output.\n",
    "dataset.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 文本向量化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 词向量\n",
    "- 使用基于中文维基语料、用 gensim 包训练出来的词向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:53:52.505096Z",
     "start_time": "2020-05-14T01:53:50.251714Z"
    }
   },
   "outputs": [],
   "source": [
    "# Load the pretrained Word2Vec model from model_path and keep only its word\n",
    "# vectors (`wv`); the model object itself is never bound to a name.\n",
    "wv = Word2Vec.load(model_path).wv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:53:53.534930Z",
     "start_time": "2020-05-14T01:53:53.529245Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "845989"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of words covered by the pretrained vectors.\n",
    "len(wv.index2word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:53:56.712250Z",
     "start_time": "2020-05-14T01:53:56.600383Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('张静初', 0.8863259553909302),\n",
       " ('刘烨', 0.8568055629730225),\n",
       " ('廖凡', 0.8530554175376892),\n",
       " ('张震', 0.8500816822052002),\n",
       " ('张涵予', 0.8481923341751099),\n",
       " ('胡军', 0.8465080857276917),\n",
       " ('段奕宏', 0.8462876081466675),\n",
       " ('黄轩', 0.844296932220459),\n",
       " ('黄渤', 0.8436167240142822),\n",
       " ('喻亢', 0.8426705598831177)]"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check of the embeddings: nearest neighbours of the word '吴京'.\n",
    "wv.most_similar(positive=['吴京'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:54:02.650621Z",
     "start_time": "2020-05-14T01:54:02.642026Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "48404"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Position of '吴京' in the embedding vocabulary, read from its vocab entry.\n",
    "index = wv.vocab['吴京'].index\n",
    "index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建词汇表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:55:46.688405Z",
     "start_time": "2020-05-14T01:55:46.682193Z"
    }
   },
   "outputs": [],
   "source": [
    "def build_vocab(texts):\n",
    "    \"\"\"Count how often each word occurs across `texts`.\n",
    "\n",
    "    Args:\n",
    "        texts: iterable of strings whose words are separated by whitespace.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each word to its number of occurrences, keyed in order\n",
    "        of first appearance. The token 'unkown' is never counted.\n",
    "    \"\"\"\n",
    "    counts = Counter(\n",
    "        token\n",
    "        for text in texts\n",
    "        for token in text.split()\n",
    "        if token != 'unkown'  # literal token to skip; spelling kept as is\n",
    "    )\n",
    "    # Hand back a plain dict rather than the Counter subclass.\n",
    "    return dict(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:55:55.435671Z",
     "start_time": "2020-05-14T01:55:47.808140Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "评论总单词数：45852058；词汇表单词个数：342733\n"
     ]
    }
   ],
   "source": [
    "# Word frequencies over all cleaned comments.\n",
    "vocab = build_vocab(dataset['cleaned_comment'])\n",
    "\n",
    "# Report the total number of tokens and the number of distinct words.\n",
    "n_tokens, n_types = sum(vocab.values()), len(vocab)\n",
    "print(\"评论总单词数：{}；词汇表单词个数：{}\".format(n_tokens, n_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:56:06.873169Z",
     "start_time": "2020-05-14T01:56:06.854602Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'unkown': 0,\n",
       " '吴京': 1,\n",
       " '意淫': 2,\n",
       " '到': 3,\n",
       " '了': 4,\n",
       " '脑残': 5,\n",
       " '的': 6,\n",
       " '地步': 7,\n",
       " '看': 8,\n",
       " '恶心': 9,\n",
       " '想': 10,\n",
       " '吐': 11,\n",
       " '首映礼': 12,\n",
       " '太': 13,\n",
       " '恐怖': 14,\n",
       " '这个': 15,\n",
       " '电影': 16,\n",
       " '不讲道理': 17,\n",
       " '完全': 18,\n",
       " '就是': 19,\n",
       " '在': 20,\n",
       " '实现': 21,\n",
       " '他': 22,\n",
       " '小': 23,\n",
       " '粉红': 24,\n",
       " '英雄': 25,\n",
       " '梦': 26,\n",
       " '各种': 27,\n",
       " '装备': 28,\n",
       " '轮番': 29,\n",
       " '上场': 30,\n",
       " '视': 31,\n",
       " '物理': 32,\n",
       " '逻辑': 33,\n",
       " '于': 34,\n",
       " '不顾': 35,\n",
       " '不得不': 36,\n",
       " '说': 37,\n",
       " '有钱': 38,\n",
       " '真': 39,\n",
       " '好': 40,\n",
       " '随意': 41,\n",
       " '胡闹': 42,\n",
       " '炒作': 43,\n",
       " '水平': 44,\n",
       " '不输': 45,\n",
       " '冯小刚': 46,\n",
       " '但小刚': 47,\n",
       " '至少': 48,\n",
       " '不会': 49,\n",
       " '用': 50,\n",
       " '主旋律': 51,\n",
       " '来': 52,\n",
       " '让': 53,\n",
       " '人': 54,\n",
       " '不': 55,\n",
       " '舒服': 56,\n",
       " '为了': 57,\n",
       " '而': 58,\n",
       " '煽情': 59,\n",
       " '觉得': 60,\n",
       " '是': 61,\n",
       " '个': 62,\n",
       " '大': 63,\n",
       " '做作': 64,\n",
       " '谎言': 65,\n",
       " '家': 66,\n",
       " '7': 67,\n",
       " '29': 68,\n",
       " '更新': 69,\n",
       " '片子': 70,\n",
       " '整体': 71,\n",
       " '不如': 72,\n",
       " '湄公河': 73,\n",
       " '行动': 74,\n",
       " '1': 75,\n",
       " '不够': 76,\n",
       " '流畅': 77,\n",
       " '编剧': 78,\n",
       " '有毒': 79,\n",
       " '台词': 80,\n",
       " '尴尬': 81,\n",
       " '2': 82,\n",
       " '刻意': 83,\n",
       " '显得': 84,\n",
       " '如此': 85,\n",
       " '不合时宜': 86,\n",
       " '又': 87,\n",
       " '多余': 88,\n",
       " '凭良心说': 89,\n",
       " '看到': 90,\n",
       " '不像': 91,\n",
       " '战狼': 92,\n",
       " '续集': 93,\n",
       " '完虐': 94,\n",
       " '中二得': 95,\n",
       " '很': 96,\n",
       " '犯': 97,\n",
       " '我': 98,\n",
       " '中华': 99,\n",
       " '者': 100,\n",
       " '虽远必': 101,\n",
       " '诛': 102,\n",
       " '比': 103,\n",
       " '这句': 104,\n",
       " '话': 105,\n",
       " '还要': 106,\n",
       " '一百倍': 107,\n",
       " '脑子': 108,\n",
       " '东西': 109,\n",
       " '希望': 110,\n",
       " '们': 111,\n",
       " '都': 112,\n",
       " '能': 113,\n",
       " '有': 114,\n",
       " '三星': 115,\n",
       " '半': 116,\n",
       " '实打实': 117,\n",
       " '分': 118,\n",
       " '第一集': 119,\n",
       " '爱国': 120,\n",
       " '内部': 121,\n",
       " '做': 122,\n",
       " '着': 123,\n",
       " '置换': 124,\n",
       " '与': 125,\n",
       " '较劲': 126,\n",
       " '但': 127,\n",
       " '第二集': 128,\n",
       " '才': 129,\n",
       " '真正': 130,\n",
       " '显露': 131,\n",
       " '野心': 132,\n",
       " '终于': 133,\n",
       " '抛弃': 134,\n",
       " '李忠志': 135,\n",
       " '新增': 136,\n",
       " '外来': 137,\n",
       " '班底': 138,\n",
       " '硬件': 139,\n",
       " '实力': 140,\n",
       " '机会': 141,\n",
       " '和': 142,\n",
       " '国际': 143,\n",
       " '接轨': 144,\n",
       " '开篇': 145,\n",
       " '水下': 146,\n",
       " '长镜头': 147,\n",
       " '诸如': 148,\n",
       " '铁丝网': 149,\n",
       " '拦截': 150,\n",
       " 'rpg': 151,\n",
       " '弹头': 152,\n",
       " '细节': 153,\n",
       " '设计': 154,\n",
       " '国产': 155,\n",
       " '动作片': 156,\n",
       " '重新': 157,\n",
       " '封顶': 158,\n",
       " '理念': 159,\n",
       " '上': 160,\n",
       " '它': 161,\n",
       " '甚至': 162,\n",
       " '做到': 163,\n",
       " '绣春刀': 164,\n",
       " '最': 165,\n",
       " '想做到': 166,\n",
       " '那': 167,\n",
       " '部分': 168,\n",
       " '惊险': 169,\n",
       " '大气': 170,\n",
       " '引人入胜': 171,\n",
       " '结合': 172,\n",
       " '不俗': 173,\n",
       " '快': 174,\n",
       " '剪下': 175,\n",
       " '真刀真枪': 176,\n",
       " '不禁': 177,\n",
       " '热血沸腾': 178,\n",
       " '特别': 179,\n",
       " '弹簧床': 180,\n",
       " '架': 181,\n",
       " '挡': 182,\n",
       " '炸弹': 183,\n",
       " '空手': 184,\n",
       " '接': 185,\n",
       " '碎玻璃': 186,\n",
       " '弹匣': 187,\n",
       " '割喉': 188,\n",
       " '等': 189,\n",
       " '帅': 190,\n",
       " '得': 191,\n",
       " '飞起': 192,\n",
       " '就算': 193,\n",
       " '前半段': 194,\n",
       " '铺垫': 195,\n",
       " '节奏': 196,\n",
       " '散漫': 197,\n",
       " '主角': 198,\n",
       " '光环': 199,\n",
       " '开太大': 200,\n",
       " '也': 201,\n",
       " '不怕': 202,\n",
       " '作为': 203,\n",
       " '一个': 204,\n",
       " '中国': 205,\n",
       " '两个': 206,\n",
       " '小时': 207,\n",
       " '弥漫着': 208,\n",
       " '强大': 209,\n",
       " '不可': 210,\n",
       " '侵犯': 211,\n",
       " '氛围': 212,\n",
       " '还是': 213,\n",
       " '那颗': 214,\n",
       " '民族': 215,\n",
       " '自豪': 216,\n",
       " '心': 217,\n",
       " '砰砰': 218,\n",
       " '砰': 219,\n",
       " '跳个': 220,\n",
       " '不停': 221,\n",
       " '15': 222,\n",
       " '100': 223,\n",
       " '冷峰': 224,\n",
       " '这部': 225,\n",
       " '里': 226,\n",
       " '即': 227,\n",
       " '像': 228,\n",
       " '成龙': 229,\n",
       " '像杰': 230,\n",
       " '森斯坦': 231,\n",
       " '森': 232,\n",
       " '体制': 233,\n",
       " '外': 234,\n",
       " '同': 235,\n",
       " '类型': 236,\n",
       " '总是': 237,\n",
       " '代表': 238,\n",
       " '个人': 239,\n",
       " '无能': 240,\n",
       " '政府': 241,\n",
       " '需要': 242,\n",
       " '求助于': 243,\n",
       " '这些': 244,\n",
       " '才能': 245,\n",
       " '解决': 246,\n",
       " '难题': 247,\n",
       " '体现': 248,\n",
       " '价值': 249,\n",
       " '所以': 250,\n",
       " '照抄': 251,\n",
       " '这种': 252,\n",
       " '模式': 253,\n",
       " '实际上': 254,\n",
       " '问题': 255,\n",
       " '我们': 256,\n",
       " '以前': 257,\n",
       " '嘲笑': 258,\n",
       " '英雄主义': 259,\n",
       " '却': 260,\n",
       " '没想到': 261,\n",
       " '捆绑': 262,\n",
       " '爱国主义': 263,\n",
       " '全能': 264,\n",
       " '战士': 265,\n",
       " '更加': 266,\n",
       " '难以': 267,\n",
       " '下咽': 268,\n",
       " '多': 269,\n",
       " '无脑': 270,\n",
       " '信': 271,\n",
       " '戏': 272,\n",
       " '对': 273,\n",
       " '吴京路': 274,\n",
       " '转粉': 275,\n",
       " '最后': 276,\n",
       " '彩蛋': 277,\n",
       " '没有': 278,\n",
       " '理由': 279,\n",
       " '期待': 280,\n",
       " '下': 281,\n",
       " '一部': 282,\n",
       " '假': 283,\n",
       " '嗨': 284,\n",
       " '几处': 285,\n",
       " '情节': 286,\n",
       " '设置': 287,\n",
       " '过于': 288,\n",
       " '彰显': 289,\n",
       " '国家': 290,\n",
       " '自豪感': 291,\n",
       " '稍显': 292,\n",
       " '突兀': 293,\n",
       " '爽片': 294,\n",
       " '打戏': 295,\n",
       " '挺燃': 296,\n",
       " '但是': 297,\n",
       " '故事': 298,\n",
       " '一般': 299,\n",
       " '达康': 300,\n",
       " '书记': 301,\n",
       " '合适': 302,\n",
       " '角色': 303,\n",
       " '赵': 304,\n",
       " '东来': 305,\n",
       " '倒': 306,\n",
       " '张瀚': 307,\n",
       " '太太': 308,\n",
       " '太违': 309,\n",
       " '分钟': 310,\n",
       " '穿越': 311,\n",
       " '回': 312,\n",
       " '偶像剧': 313,\n",
       " '接到': 314,\n",
       " '非洲': 315,\n",
       " '卧底': 316,\n",
       " '冷锋': 317,\n",
       " '报告': 318,\n",
       " '丁义珍': 319,\n",
       " '现在': 320,\n",
       " '请求': 321,\n",
       " '抓捕': 322,\n",
       " '李达康': 323,\n",
       " '这件': 324,\n",
       " '事先': 325,\n",
       " '不要': 326,\n",
       " '声张': 327,\n",
       " '别': 328,\n",
       " '省厅': 329,\n",
       " '知道': 330,\n",
       " '就': 331,\n",
       " '你': 332,\n",
       " '一起': 333,\n",
       " '去': 334,\n",
       " '加上': 335,\n",
       " '同志': 336,\n",
       " '三人': 337,\n",
       " '逮捕': 338,\n",
       " '这次': 339,\n",
       " '行': 340,\n",
       " '叫': 341,\n",
       " '吧': 342,\n",
       " '拍': 343,\n",
       " '喜剧': 344,\n",
       " '整个': 345,\n",
       " '感觉': 346,\n",
       " '挺': 347,\n",
       " '搞笑': 348,\n",
       " '这么': 349,\n",
       " '打': 350,\n",
       " '过': 351,\n",
       " '徐晓冬': 352,\n",
       " '么': 353,\n",
       " '心往': 354,\n",
       " '一处': 355,\n",
       " '劲往': 356,\n",
       " '使': 357,\n",
       " '梦想': 358,\n",
       " '看吧': 359,\n",
       " '第一部': 360,\n",
       " '好太多': 361,\n",
       " '谢谢': 362,\n",
       " '美队': 363,\n",
       " '动作': 364,\n",
       " '指导': 365,\n",
       " '这': 366,\n",
       " '火': 367,\n",
       " '没见识': 368,\n",
       " '开头': 369,\n",
       " '长': 370,\n",
       " '对决': 371,\n",
       " '戏可算': 372,\n",
       " '华语': 373,\n",
       " '顶尖': 374,\n",
       " '存在': 375,\n",
       " '驱逐舰': 376,\n",
       " '导弹': 377,\n",
       " '坦克': 378,\n",
       " '商业片': 379,\n",
       " '狂用': 380,\n",
       " '镜头': 381,\n",
       " '运用': 382,\n",
       " '笑': 383,\n",
       " '点': 384,\n",
       " '插入': 385,\n",
       " '好莱坞': 386,\n",
       " '爆米花': 387,\n",
       " '不功': 388,\n",
       " '不过': 389,\n",
       " '从头': 390,\n",
       " '打到': 391,\n",
       " '尾': 392,\n",
       " '拼': 393,\n",
       " '虽然': 394,\n",
       " '有略': 395,\n",
       " '乱': 396,\n",
       " '时': 397,\n",
       " '因为': 398,\n",
       " '没': 399,\n",
       " '啥': 400,\n",
       " '期望值': 401,\n",
       " '被': 402,\n",
       " '吓了一跳': 403,\n",
       " '吴刚': 404,\n",
       " '谦和': 405,\n",
       " '丁海峰': 406,\n",
       " '老': 407,\n",
       " '三位': 408,\n",
       " '炖': 409,\n",
       " '烂熟': 410,\n",
       " '牛筋': 411,\n",
       " '嚼': 412,\n",
       " '用心': 413,\n",
       " '啊': 414,\n",
       " '导演': 415,\n",
       " '小看': 416,\n",
       " '确实': 417,\n",
       " '下功夫': 418,\n",
       " '拉': 419,\n",
       " '借鉴': 420,\n",
       " '至于': 421,\n",
       " '大家': 422,\n",
       " '比较': 423,\n",
       " '反感': 424,\n",
       " '情绪': 425,\n",
       " '那些': 426,\n",
       " '桥段': 427,\n",
       " '必备': 428,\n",
       " '稍微': 429,\n",
       " '一点': 430,\n",
       " '还': 431,\n",
       " '可以': 432,\n",
       " '接受': 433,\n",
       " '最好': 434,\n",
       " '地方': 435,\n",
       " '掌握': 436,\n",
       " '张弛': 437,\n",
       " '有度': 438,\n",
       " '这点': 439,\n",
       " '难得': 440,\n",
       " '一直': 441,\n",
       " '脑子里': 442,\n",
       " '回响': 443,\n",
       " '片头': 444,\n",
       " '海里': 445,\n",
       " '那场': 446,\n",
       " '戏看': 447,\n",
       " '完': 448,\n",
       " '呆': 449,\n",
       " '下去': 450,\n",
       " '太假': 451,\n",
       " '提前': 452,\n",
       " '离场': 453,\n",
       " '好看': 454,\n",
       " '演技': 455,\n",
       " '棒呆': 456,\n",
       " '符合': 457,\n",
       " '反而': 458,\n",
       " '更': 459,\n",
       " '差': 460,\n",
       " '这一': 461,\n",
       " '放之四海而皆准': 462,\n",
       " '规律': 463,\n",
       " '场面': 464,\n",
       " '越做越': 465,\n",
       " '然而': 466,\n",
       " '伴随': 467,\n",
       " '特效': 468,\n",
       " '升级': 469,\n",
       " '叙事': 470,\n",
       " '变得': 471,\n",
       " '非常': 472,\n",
       " '凌乱': 473,\n",
       " '格局': 474,\n",
       " '颇': 475,\n",
       " '拍成': 476,\n",
       " '黑鹰坠落': 477,\n",
       " '结果': 478,\n",
       " '撑死': 479,\n",
       " '最多': 480,\n",
       " '只是': 481,\n",
       " '官方': 482,\n",
       " '版': 483,\n",
       " '敢死队': 484,\n",
       " '但论': 485,\n",
       " '自我': 486,\n",
       " '角色定位': 487,\n",
       " '能力': 488,\n",
       " '远': 489,\n",
       " '如同': 490,\n",
       " '演员': 491,\n",
       " '出身': 492,\n",
       " '甄子丹': 493,\n",
       " '喜欢': 494,\n",
       " '不是': 495,\n",
       " '装傻': 496,\n",
       " '真傻': 497,\n",
       " '要不是': 498,\n",
       " '真的': 499,\n",
       " '别的': 500,\n",
       " '可': 501,\n",
       " '肯定': 502,\n",
       " '选': 503,\n",
       " '直男癌': 504,\n",
       " '令人发指': 505,\n",
       " '所有': 506,\n",
       " '剧情': 507,\n",
       " '走向': 508,\n",
       " '九十年代': 509,\n",
       " '那套': 510,\n",
       " '照搬': 511,\n",
       " '审美': 512,\n",
       " '事儿': 513,\n",
       " '一时半会儿': 514,\n",
       " '培养': 515,\n",
       " '出来': 516,\n",
       " '整部': 517,\n",
       " '延续': 518,\n",
       " '风格': 519,\n",
       " '热血': 520,\n",
       " '要': 521,\n",
       " '不错': 522,\n",
       " '适合': 523,\n",
       " '演': 524,\n",
       " '军人': 525,\n",
       " '之前': 526,\n",
       " '片段': 527,\n",
       " '念': 528,\n",
       " '劲儿': 529,\n",
       " '来说': 530,\n",
       " '张翰太违': 531,\n",
       " '一': 532,\n",
       " '一股': 533,\n",
       " '雷阵雨': 534,\n",
       " '画风': 535,\n",
       " '目瞪狗': 536,\n",
       " '瘠薄': 537,\n",
       " '人牛': 538,\n",
       " 'b': 539,\n",
       " '硬道理': 540,\n",
       " '隔壁': 541,\n",
       " '建军': 542,\n",
       " '大爷': 543,\n",
       " '你们': 544,\n",
       " '场景': 545,\n",
       " '战斗': 546,\n",
       " '全线': 547,\n",
       " '打斗': 548,\n",
       " '游走': 549,\n",
       " '审查': 550,\n",
       " '红线': 551,\n",
       " '边界': 552,\n",
       " '政治': 553,\n",
       " '安全': 554,\n",
       " '缝隙': 555,\n",
       " '部': 556,\n",
       " '极具': 557,\n",
       " '煽动': 558,\n",
       " '大片': 559,\n",
       " '制作': 560,\n",
       " '精良': 561,\n",
       " '影片': 562,\n",
       " '请': 563,\n",
       " '多来': 564,\n",
       " '胶卷': 565,\n",
       " '挺差': 566,\n",
       " '过度': 567,\n",
       " '部队': 568,\n",
       " '没太多': 569,\n",
       " '展示': 570,\n",
       " '死去': 571,\n",
       " '反正': 572,\n",
       " '吸引': 573,\n",
       " '冲': 574,\n",
       " '为什么': 575,\n",
       " '鄙视': 576,\n",
       " '敢': 577,\n",
       " '开拓': 578,\n",
       " '允许': 579,\n",
       " '他们': 580,\n",
       " '再': 581,\n",
       " '直到': 582,\n",
       " '更好': 583,\n",
       " '拍出': 584,\n",
       " '棒': 585,\n",
       " '出彩': 586,\n",
       " '呢': 587,\n",
       " '火爆': 588,\n",
       " '本片': 589,\n",
       " '必将': 590,\n",
       " '燃爆': 591,\n",
       " '暑期': 592,\n",
       " '厉害': 593,\n",
       " '身为': 594,\n",
       " '武打': 595,\n",
       " '高标准': 596,\n",
       " '枪战': 597,\n",
       " '为': 598,\n",
       " '点赞': 599,\n",
       " '热血男儿': 600,\n",
       " '荷尔蒙': 601,\n",
       " '爆发': 602,\n",
       " '给': 603,\n",
       " '0': 604,\n",
       " '星': 605,\n",
       " '血战': 606,\n",
       " '钢锯': 607,\n",
       " '岭': 608,\n",
       " '会': 609,\n",
       " '歌颂': 610,\n",
       " '宗教': 611,\n",
       " '情怀': 612,\n",
       " '超越': 613,\n",
       " '政权': 614,\n",
       " '当': 615,\n",
       " '只': 616,\n",
       " '明显': 617,\n",
       " '低': 618,\n",
       " '层次': 619,\n",
       " '充满': 620,\n",
       " '现实': 621,\n",
       " '乃至': 622,\n",
       " '投机': 623,\n",
       " '考量': 624,\n",
       " '高下': 625,\n",
       " '立': 626,\n",
       " '见': 627,\n",
       " '请问': 628,\n",
       " '吴京脑': 629,\n",
       " '残': 630,\n",
       " '火箭炮': 631,\n",
       " '吗': 632,\n",
       " '傲气': 633,\n",
       " '雄鹰': 634,\n",
       " '第一': 635,\n",
       " '滴血': 636,\n",
       " '4': 637,\n",
       " '算是': 638,\n",
       " '国内': 639,\n",
       " '片': 640,\n",
       " '准': 641,\n",
       " '钱': 642,\n",
       " '花': 643,\n",
       " '有效': 644,\n",
       " '气魄': 645,\n",
       " '创作': 646,\n",
       " '足够': 647,\n",
       " '真诚': 648,\n",
       " '人物': 649,\n",
       " '连': 650,\n",
       " '张翰': 651,\n",
       " '可爱': 652,\n",
       " '如果': 653,\n",
       " '当年': 654,\n",
       " '那样': 655,\n",
       " '一时': 656,\n",
       " '膨胀': 657,\n",
       " '银幕': 658,\n",
       " '独占': 659,\n",
       " '聚光灯': 660,\n",
       " '走': 661,\n",
       " '扪心自问': 662,\n",
       " '没法': 663,\n",
       " '评价': 664,\n",
       " '全片': 665,\n",
       " '靠': 666,\n",
       " '戏撑': 667,\n",
       " '文戏': 668,\n",
       " '扯淡': 669,\n",
       " '女主角': 670,\n",
       " '毫无': 671,\n",
       " '必要': 672,\n",
       " '只要': 673,\n",
       " '开挂': 674,\n",
       " '牛': 675,\n",
       " '逼': 676,\n",
       " '之处': 677,\n",
       " '在于': 678,\n",
       " '透露': 679,\n",
       " '极': 680,\n",
       " '强烈': 681,\n",
       " '意识形态': 682,\n",
       " '枷锁': 683,\n",
       " '祖国': 684,\n",
       " '面前': 685,\n",
       " '一切': 686,\n",
       " '反动派': 687,\n",
       " '纸老虎': 688,\n",
       " '人开': 689,\n",
       " '挂': 690,\n",
       " '团灭': 691,\n",
       " '合情合理': 692,\n",
       " '两星': 693,\n",
       " '鼓励': 694,\n",
       " '其他': 695,\n",
       " '一般般': 696,\n",
       " '看点': 697,\n",
       " '有点': 698,\n",
       " '手接': 699,\n",
       " '哈哈哈': 700,\n",
       " '从': 701,\n",
       " '之后': 702,\n",
       " '炸': 703,\n",
       " '翻': 704,\n",
       " '一下': 705,\n",
       " '四星': 706,\n",
       " '当时': 707,\n",
       " '其实': 708,\n",
       " '完成度': 709,\n",
       " '接近': 710,\n",
       " '每个': 711,\n",
       " '步骤': 712,\n",
       " '顺滑': 713,\n",
       " '任何': 714,\n",
       " '出人意料': 715,\n",
       " '是因为': 716,\n",
       " '看看': 717,\n",
       " '最近': 718,\n",
       " '世界': 719,\n",
       " '抱歉': 720,\n",
       " '影院': 721,\n",
       " '燃': 722,\n",
       " '起来': 723,\n",
       " '魔幻': 724,\n",
       " '当然': 725,\n",
       " '强拆': 726,\n",
       " '现实感': 727,\n",
       " '一幕': 728,\n",
       " '开场': 729,\n",
       " '6': 730,\n",
       " '搏斗': 731,\n",
       " '从来': 732,\n",
       " '其它': 733,\n",
       " '拍摄': 734,\n",
       " '难度': 735,\n",
       " '同时': 736,\n",
       " '技能': 737,\n",
       " '方面': 738,\n",
       " '要求': 739,\n",
       " '回来': 740,\n",
       " '搜': 741,\n",
       " '吴京会': 742,\n",
       " '游泳': 743,\n",
       " '潜水': 744,\n",
       " '滑雪': 745,\n",
       " '开': 746,\n",
       " '飞机': 747,\n",
       " '射击': 748,\n",
       " '各项': 749,\n",
       " '特意': 750,\n",
       " '特种部队': 751,\n",
       " '当过': 752,\n",
       " '18': 753,\n",
       " '月': 754,\n",
       " '兵': 755,\n",
       " '佩服': 756,\n",
       " '这样': 757,\n",
       " '3': 758,\n",
       " '星半': 759,\n",
       " '结束': 760,\n",
       " '掌声': 761,\n",
       " '出现': 762,\n",
       " '近期': 763,\n",
       " '少见': 764,\n",
       " '一粒': 765,\n",
       " '大补丸': 766,\n",
       " '有人': 767,\n",
       " '吃': 768,\n",
       " '开心': 769,\n",
       " '补大': 770,\n",
       " '从白': 771,\n",
       " '黑': 772,\n",
       " '字幕': 773,\n",
       " '展现': 774,\n",
       " '超级': 775,\n",
       " '直': 776,\n",
       " '男': 777,\n",
       " '糙': 778,\n",
       " '猛': 779,\n",
       " '媲美': 780,\n",
       " '终结者': 781,\n",
       " '5': 782,\n",
       " '无亮点': 783,\n",
       " '张翰变': 784,\n",
       " '谐星': 785,\n",
       " '3d': 786,\n",
       " '掌控': 787,\n",
       " '逼近': 788,\n",
       " 'hold': 789,\n",
       " '不住': 790,\n",
       " '边缘': 791,\n",
       " '带感': 792,\n",
       " '拳拳': 793,\n",
       " '肉': 794,\n",
       " '超爽': 795,\n",
       " '聪明': 796,\n",
       " '鸡': 797,\n",
       " '贼': 798,\n",
       " '一面': 799,\n",
       " '旗下': 800,\n",
       " '呈现': 801,\n",
       " '一出': 802,\n",
       " '重工业': 803,\n",
       " '娱乐': 804,\n",
       " '调控': 805,\n",
       " '说教': 806,\n",
       " '比例': 807,\n",
       " '尺度': 808,\n",
       " '大众': 809,\n",
       " '接纳': 810,\n",
       " '把握': 811,\n",
       " '微妙': 812,\n",
       " '其中': 813,\n",
       " '一些': 814,\n",
       " '奇侠': 815,\n",
       " '化': 816,\n",
       " '内容': 817,\n",
       " '比如': 818,\n",
       " '玻璃碴': 819,\n",
       " '子当': 820,\n",
       " '飞镖': 821,\n",
       " '杀敌': 822,\n",
       " '一类': 823,\n",
       " '只不过': 824,\n",
       " '遮盖': 825,\n",
       " '掉': 826,\n",
       " '老爹': 827,\n",
       " '演过': 828,\n",
       " '美剧': 829,\n",
       " '搏击': 830,\n",
       " '王国': 831,\n",
       " '力荐': 832,\n",
       " '那部': 833,\n",
       " '为啥': 834,\n",
       " '奇异': 835,\n",
       " '恩典': 836,\n",
       " '配乐': 837,\n",
       " '画内': 838,\n",
       " '男生': 839,\n",
       " '的话': 840,\n",
       " '应该': 841,\n",
       " '刺激': 842,\n",
       " '肾上腺素': 843,\n",
       " '女生': 844,\n",
       " '对龙小云': 845,\n",
       " '感情': 846,\n",
       " '十分': 847,\n",
       " '打动': 848,\n",
       " '模仿': 849,\n",
       " '许多': 850,\n",
       " '怎么': 851,\n",
       " '玩': 852,\n",
       " '一股脑': 853,\n",
       " '堆': 854,\n",
       " '槽': 855,\n",
       " '几位': 856,\n",
       " '血厚到': 857,\n",
       " '科幻': 858,\n",
       " '级别': 859,\n",
       " '重复': 860,\n",
       " '满血': 861,\n",
       " '红血': 862,\n",
       " '中毒': 863,\n",
       " '极速': 864,\n",
       " '回血': 865,\n",
       " '爆种': 866,\n",
       " '打通': 867,\n",
       " '全场': 868,\n",
       " '太过': 869,\n",
       " '投机取巧': 870,\n",
       " '穿': 871,\n",
       " '迈克尔': 872,\n",
       " '贝都': 873,\n",
       " '不受': 874,\n",
       " '待见': 875,\n",
       " '国片': 876,\n",
       " '前仆后继': 877,\n",
       " '爆炸': 878,\n",
       " 'high': 879,\n",
       " '瞎燃': 880,\n",
       " '没用': 881,\n",
       " '10': 882,\n",
       " '女人': 883,\n",
       " '缺': 884,\n",
       " '男人': 885,\n",
       " '征服': 886,\n",
       " '吴京直': 887,\n",
       " '男癌': 888,\n",
       " '🇨': 889,\n",
       " '🇳': 890,\n",
       " '美国': 891,\n",
       " '不行': 892,\n",
       " '死': 893,\n",
       " '全都': 894,\n",
       " '跳': 895,\n",
       " '跟': 896,\n",
       " '跳墙': 897,\n",
       " '一样': 898,\n",
       " '拯救': 899,\n",
       " '国产片': 900,\n",
       " '以': 901,\n",
       " '中印': 902,\n",
       " '局势': 903,\n",
       " '对比': 904,\n",
       " '假想': 905,\n",
       " '真是': 906,\n",
       " '讽刺': 907,\n",
       " '谄媚': 908,\n",
       " '军旅': 909,\n",
       " '题材': 910,\n",
       " '质感': 911,\n",
       " '燃到': 912,\n",
       " '国外': 913,\n",
       " '精彩': 914,\n",
       " '看着': 915,\n",
       " '有力': 916,\n",
       " '必须': 917,\n",
       " '安利': 918,\n",
       " '一下张': 919,\n",
       " '翰': 920,\n",
       " '简直': 921,\n",
       " '承包': 922,\n",
       " '笑点': 923,\n",
       " '量身定做': 924,\n",
       " '彭于': 925,\n",
       " '晏': 926,\n",
       " '可演': 927,\n",
       " '不来': 928,\n",
       " '不少': 929,\n",
       " '漂移': 930,\n",
       " '无人机': 931,\n",
       " '突袭': 932,\n",
       " '直升机': 933,\n",
       " '坠露': 934,\n",
       " '肉搏': 935,\n",
       " '军舰': 936,\n",
       " '发射': 937,\n",
       " '叛乱': 938,\n",
       " '国际化': 939,\n",
       " '视角': 940,\n",
       " '标配': 941,\n",
       " '饰演': 942,\n",
       " '深入人心': 943,\n",
       " '搏命': 944,\n",
       " '精神': 945,\n",
       " '当下': 946,\n",
       " '第三部': 947,\n",
       " '好燃': 948,\n",
       " '表白': 949,\n",
       " '典型': 950,\n",
       " '方式': 951,\n",
       " '每次': 952,\n",
       " '猜': 953,\n",
       " '没劲': 954,\n",
       " '诶': 955,\n",
       " '问': 956,\n",
       " '王牌': 957,\n",
       " '特工': 958,\n",
       " '那么': 959,\n",
       " '杀人': 960,\n",
       " '经过': 961,\n",
       " '艺术': 962,\n",
       " '处理': 963,\n",
       " '直接': 964,\n",
       " '删': 965,\n",
       " '血腥': 966,\n",
       " '屠杀': 967,\n",
       " '赤裸裸': 968,\n",
       " '大段': 969,\n",
       " '正确': 970,\n",
       " '庇衣': 971,\n",
       " '意料之中': 972,\n",
       " '意料之外': 973,\n",
       " '惊喜': 974,\n",
       " '属于': 975,\n",
       " '狼性': 976,\n",
       " '军魂': 977,\n",
       " '几个': 978,\n",
       " '网红拉': 979,\n",
       " '弹弹琴': 980,\n",
       " '大国': 981,\n",
       " '气象': 982,\n",
       " '满屏': 983,\n",
       " 'tm': 984,\n",
       " '告诉': 985,\n",
       " '吴': 986,\n",
       " '迪塞尔': 987,\n",
       " '如入无人之境': 988,\n",
       " '亿': 989,\n",
       " '大陆': 990,\n",
       " '一刻': 991,\n",
       " '集体': 992,\n",
       " '勃起': 993,\n",
       " '离开': 994,\n",
       " '影厅': 995,\n",
       " '屌丝': 996,\n",
       " '同样': 997,\n",
       " '开始': 998,\n",
       " '前': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2index = {'unkown': 0}\n",
    "for word, _ in vocab.items():\n",
    "    word2index[word] = len(word2index)\n",
    "\n",
    "word2index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 评论中没有词向量中的词\n",
    "- 解决措施：继续分词\n",
    "> **拼写纠错??：如“鸡冻”--> “激动”**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:57:08.611877Z",
     "start_time": "2020-05-14T01:57:08.605531Z"
    }
   },
   "outputs": [],
   "source": [
    "def check_coverage(vocab, wv):\n",
    "    known_words = {}\n",
    "    unknown_words = {}\n",
    "    no_known_words = 0\n",
    "    no_unknown_words = 0\n",
    "    for word in vocab:\n",
    "        try:\n",
    "            known_words[word] = wv[word]\n",
    "            no_known_words += vocab[word]\n",
    "        except:\n",
    "            unknown_words[word] = vocab[word]\n",
    "            no_unknown_words += vocab[word]\n",
    "    print('词汇表中 {:.2%} 的单词有词向量'.format(len(known_words) / len(vocab)))\n",
    "    print('评论的所有单词中 {:.2%} 的单词有词向量'.format(\n",
    "        no_known_words / (no_known_words + no_unknown_words)))\n",
    "    unknown_words = sorted(unknown_words.items(),\n",
    "                           key=operator.itemgetter(1))[::-1]\n",
    "    return unknown_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:57:13.583231Z",
     "start_time": "2020-05-14T01:57:13.109146Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词汇表中 46.14% 的单词有词向量\n",
      "评论的所有单词中 97.08% 的单词有词向量\n"
     ]
    }
   ],
   "source": [
    "unknown_words = check_coverage(vocab, wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:57:56.124138Z",
     "start_time": "2020-05-14T01:57:56.115046Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "184582"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(unknown_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T01:57:58.400788Z",
     "start_time": "2020-05-14T01:57:58.350114Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhotdwarvesatyourservice',\n",
       "  1),\n",
       " ('blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah',\n",
       "  1),\n",
       " ('hiusjufbjijhjjsfightdjsiosnwnbombsisjwnexplosionwhshbdbwwhatfuckjusthappenddhsjsndhdjfighrekizjekilldbiejsshotdhsjsnthatbitchsjdjjffuckjejsh',\n",
       "  1),\n",
       " ('111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111',\n",
       "  1),\n",
       " ('100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',\n",
       "  1),\n",
       " ('teateatatteateatatteateatatteateatatteateatatteateatatteateatatteateatatteateatatteateatatteateatatteateatat',\n",
       "  1),\n",
       " ('11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111',\n",
       "  1),\n",
       " ('蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤',\n",
       "  1),\n",
       " ('10000000000000000000000000000000000000000000000000000000000000000000000000000000000000',\n",
       "  1),\n",
       " ('wwwwwwwwwwwwwwwwwwwwttttttttttttttttttttttttttttttffffffffffffffffffffffffffffffff',\n",
       "  1),\n",
       " ('bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbest',\n",
       "  2),\n",
       " ('yoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo',\n",
       "  1),\n",
       " ('bce43fd38b93b6c518d056245a8a4de28880c502e41eebc1978142bcb20fadc9bc54d7cbb6a0',\n",
       "  1),\n",
       " ('99999999999999999999999999999999999999999999999999999999999999999999999999',\n",
       "  1),\n",
       " ('7b7b9e9f07a045731271f8ca2db2d97ad27271b6192c204242b7372614b47dff', 1),\n",
       " ('sssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss', 1),\n",
       " ('1111111111111111111111111111111111111111111111111lhkjlhhlkjhl', 1),\n",
       " ('wojiushibuxiangkandaonaocanshuijunshuachulaidefen', 1),\n",
       " ('shiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiit', 1),\n",
       " ('xl3non7u0i5ftb4lwfdosdtwfdptejr11qzwmn4pgluq', 1),\n",
       " ('prprprprprprprprprprprprprprprprprprprprpr', 2),\n",
       " ('zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', 1),\n",
       " ('97db28f635ac65285bac7790ca7a36ca2234679201', 1),\n",
       " ('youcanwalkawayknowingyouwererightallalong', 1),\n",
       " ('aaaaaahhhhhhhhhhhhhhhaaaaaaaaaaaahhhhhhhh', 1),\n",
       " ('hduom0ad9hz4v1f2wrlghrnpbanywf3qlljf84qgp', 1),\n",
       " ('5a2327c04c0c231bced131ddf3f4467eb80c1c86', 2),\n",
       " ('3a3e3c800a74582d2295df84c6624848bd6ed54a', 2),\n",
       " ('4ee833da28c2d1e1b7b377769038f2a699eb7394', 2),\n",
       " ('a731364e7b392af2f5b7bc897328cba450f9f95d', 2),\n",
       " ('1dfd74a56054b289a4a244630d7ed1e70687f21b', 2),\n",
       " ('9aa786a606eee1f6a26c3411a67731de2dd7c877', 2),\n",
       " ('d9dea7ecb35d8b2d1c4b9141f7d8a18a4f24c559', 2),\n",
       " ('sdfudnctirmejweebcdfgyt9mencetr672g8ufed', 1),\n",
       " ('yooooooooooooooooooooooooooooooooooooooo', 1),\n",
       " ('e0befba921ff7cc558e33ce2b37dece4501b1777', 1),\n",
       " ('878079ca0234895c28024acb1fb2ba08ec5237d3', 1),\n",
       " ('2520c01967207a1735171056ec588c8c1257e5f8', 1),\n",
       " ('gggggggggggggggggggggggggggggggggggggggg', 1),\n",
       " ('e022283c58bf58aa733e61d2977ca60e513a0e14', 1),\n",
       " ('5438712a5b1db87f0e5e69d9f1fe21e023864e14', 1),\n",
       " ('251f2c4a8392d2c4c73d7798d666c65690bfe1af', 1),\n",
       " ('7f3467e5f6ba2b866c1ef7029a113db4c33311dc', 1),\n",
       " ('523452693ed265f9de29558b4cfc1960c895b24a', 1),\n",
       " ('a5f5bf93f4a1fcfcc688b2b3255e0a20958751f2', 1),\n",
       " ('d6b6d9084ddc8542b188f6681d1c085f51dfb7e1', 1),\n",
       " ('zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', 1),\n",
       " ('duangduangduangbiubiubiuduangduangduang', 1),\n",
       " ('230444e633190e18301c67194fbe307d0a4bf3', 1),\n",
       " ('dw5pb25fawq9mtaznty1xzewmdawmv8wmv8wmq', 1)]"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 无词向量的 单词\n",
    "sorted(unknown_words, key=lambda w: len(w[0]), reverse=True)[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:00:13.256046Z",
     "start_time": "2020-05-14T02:00:13.229383Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('5', 36985),\n",
       " ('3', 36146),\n",
       " ('2', 34218),\n",
       " ('1', 32594),\n",
       " ('3d', 30606),\n",
       " ('4', 23395),\n",
       " ('美队', 14923),\n",
       " ('7', 14083),\n",
       " ('8', 13582),\n",
       " ('😂', 10231),\n",
       " ('10', 9704),\n",
       " ('星给', 9400),\n",
       " ('6', 9342),\n",
       " ('9', 7269),\n",
       " ('0', 7264),\n",
       " ('一般般', 6970),\n",
       " ('12', 6880),\n",
       " ('尿点', 6691),\n",
       " ('2016', 6165),\n",
       " ('加一星', 5719),\n",
       " ('⋯', 4829),\n",
       " ('20', 4604),\n",
       " ('残粉', 4342),\n",
       " ('😭', 3975),\n",
       " ('打一星', 3876),\n",
       " ('90', 3689),\n",
       " ('脑残粉', 3674),\n",
       " ('2015', 3547),\n",
       " ('老谋子', 3527),\n",
       " ('80', 3506),\n",
       " ('30', 3449),\n",
       " ('复联', 3245),\n",
       " ('泪目', 3243),\n",
       " ('100', 3125),\n",
       " ('刚看', 2987),\n",
       " ('无尿点', 2902),\n",
       " ('️', 2796),\n",
       " ('诚哥', 2781),\n",
       " ('👍', 2759),\n",
       " ('￣', 2703),\n",
       " ('讲真', 2703),\n",
       " ('╯', 2568),\n",
       " ('15', 2566),\n",
       " ('❤', 2428),\n",
       " ('搞笑片', 2397),\n",
       " ('逼格', 2361),\n",
       " ('╰', 2334),\n",
       " ('太赞', 2329),\n",
       " ('╭', 2196),\n",
       " ('╮', 2169)]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 出现次数最多的无词向量单词\n",
    "sorted(unknown_words, key=lambda w: w[1], reverse=True)[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:00:32.077160Z",
     "start_time": "2020-05-14T02:00:32.071520Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'2' in wv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 没有词向量的汉语短语"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:00:52.864893Z",
     "start_time": "2020-05-14T02:00:52.859094Z"
    }
   },
   "outputs": [],
   "source": [
    "def is_chinese(str):\n",
    "    for s in str:\n",
    "        if u'\\u4e00' <= s <= u'\\u9fff':\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:00:53.783978Z",
     "start_time": "2020-05-14T02:00:53.778751Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "is_chinese('君子之交淡如水')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:01:07.565856Z",
     "start_time": "2020-05-14T02:01:07.502681Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "167519"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unknown_chinese = [(w, c) for w, c in unknown_words if is_chinese(w)]\n",
    "len(unknown_chinese)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:01:09.559430Z",
     "start_time": "2020-05-14T02:01:09.517983Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤',\n",
       "  1),\n",
       " ('尼尼尼尼尼尼尼尼尼尼尼尼', 1),\n",
       " ('求范范范范范范范范范范爷', 1),\n",
       " ('丈二和尚摸不着头脑', 8),\n",
       " ('冰冻三尺非一日之寒', 1),\n",
       " ('九百六十万平方公里', 1),\n",
       " ('八仙过海各显神通', 37),\n",
       " ('巧妇难为无米之炊', 5),\n",
       " ('百尺竿头更进一步', 5),\n",
       " ('燕雀安知鸿鹄之志', 3)]"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(unknown_chinese, key=lambda w: len(w[0]), reverse=True)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:01:18.995756Z",
     "start_time": "2020-05-14T02:01:18.973082Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('美队', 14923),\n",
       " ('星给', 9400),\n",
       " ('一般般', 6970),\n",
       " ('尿点', 6691),\n",
       " ('加一星', 5719),\n",
       " ('残粉', 4342),\n",
       " ('打一星', 3876),\n",
       " ('脑残粉', 3674),\n",
       " ('老谋子', 3527),\n",
       " ('复联', 3245)]"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(unknown_chinese, key=lambda w: w[1], reverse=True)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **没有词向量的汉语短语，前向最大匹配继续分词**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:32:54.491450Z",
     "start_time": "2020-05-14T02:32:54.484920Z"
    }
   },
   "outputs": [],
   "source": [
    "def cut(s):\n",
    "    words = []\n",
    "    n = len(s)\n",
    "    start = 0\n",
    "    while start < n:\n",
    "        end = n\n",
    "        while start <= end <= n:\n",
    "            word = s[start:end]\n",
    "            if word in wv:\n",
    "                if not words or word != words[-1]:\n",
    "                    words.append(word)\n",
    "                start = end\n",
    "                break\n",
    "            else:\n",
    "                end -= 1\n",
    "        if start < end:\n",
    "            words.append('unknown')\n",
    "            start += 1\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:33:35.079781Z",
     "start_time": "2020-05-14T02:33:35.038655Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤蛤: ['蛤']\n",
      "尼尼尼尼尼尼尼尼尼尼尼尼: ['尼尼']\n",
      "求范范范范范范范范范范爷: ['求', '范范', '爷']\n",
      "丈二和尚摸不着头脑: ['丈二', '和尚', '摸不着头脑']\n",
      "冰冻三尺非一日之寒: ['冰冻', '三尺', '非一', '日', '之寒']\n",
      "九百六十万平方公里: ['九百', '六十万', '平方公里']\n",
      "八仙过海各显神通: ['八仙过海', '各显神通']\n",
      "巧妇难为无米之炊: ['巧妇', '难为', '无米', '之', '炊']\n",
      "百尺竿头更进一步: ['百尺竿头', '更进一步']\n",
      "燕雀安知鸿鹄之志: ['燕雀', '安知', '鸿鹄', '之志']\n"
     ]
    }
   ],
   "source": [
    "for w, _ in sorted(unknown_chinese, key=lambda w: len(w[0]),\n",
    "                   reverse=True)[:10]:\n",
    "    print(w + ':', cut(w))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:34:01.372917Z",
     "start_time": "2020-05-14T02:34:01.351220Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "美队: ['美', '队']\n",
      "星给: ['星', '给']\n",
      "一般般: ['一般', '般']\n",
      "尿点: ['尿', '点']\n",
      "加一星: ['加一', '星']\n",
      "残粉: ['残', '粉']\n",
      "打一星: ['打一', '星']\n",
      "脑残粉: ['脑残', '粉']\n",
      "老谋子: ['老', '谋', '子']\n",
      "复联: ['复', '联']\n"
     ]
    }
   ],
   "source": [
    "for w, _ in sorted(unknown_chinese, key=lambda w: w[1], reverse=True)[:10]:\n",
    "    print(w + ':', cut(w))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 全英文字符短语"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:34:56.242436Z",
     "start_time": "2020-05-14T02:34:56.237415Z"
    }
   },
   "outputs": [],
   "source": [
    "def is_string(str):\n",
    "    for s in str:\n",
    "        if s not in string.ascii_lowercase:\n",
    "            return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:34:57.313527Z",
     "start_time": "2020-05-14T02:34:57.258736Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7777"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unknown_string = [(w, c) for w, c in unknown_words if is_string(w)]\n",
    "len(unknown_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:34:59.129336Z",
     "start_time": "2020-05-14T02:34:59.102635Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('wyf', 237),\n",
       " ('qwq', 206),\n",
       " ('rdj', 145),\n",
       " ('tfboy', 140),\n",
       " ('shirly', 131),\n",
       " ('hhhhh', 127),\n",
       " ('mdzz', 127),\n",
       " ('exm', 119),\n",
       " ('xddd', 97),\n",
       " ('happyending', 88),\n",
       " ('xmen', 87),\n",
       " ('puny', 82),\n",
       " ('balabala', 78),\n",
       " ('biubiubiu', 76),\n",
       " ('blabla', 75),\n",
       " ('dvdrip', 73),\n",
       " ('hhhhhh', 65),\n",
       " ('antman', 64),\n",
       " ('anglebaby', 64),\n",
       " ('teamcap', 63),\n",
       " ('everthing', 62),\n",
       " ('quq', 61),\n",
       " ('qvq', 56),\n",
       " ('duangduangduang', 55),\n",
       " ('ccav', 52),\n",
       " ('mitsuha', 49),\n",
       " ('mlgb', 49),\n",
       " ('sherly', 48),\n",
       " ('blablabla', 48),\n",
       " ('prpr', 48),\n",
       " ('spiderboy', 45),\n",
       " ('hhhhhhh', 45),\n",
       " ('hodor', 44),\n",
       " ('duangduang', 44),\n",
       " ('himym', 43),\n",
       " ('shirely', 41),\n",
       " ('prprpr', 41),\n",
       " ('teamironman', 40),\n",
       " ('stucky', 40),\n",
       " ('blingbling', 39),\n",
       " ('nnd', 39),\n",
       " ('wwwww', 39),\n",
       " ('hiahia', 36),\n",
       " ('screenx', 33),\n",
       " ('xxoo', 33),\n",
       " ('xdddd', 32),\n",
       " ('hiahiahia', 32),\n",
       " ('zhihu', 32),\n",
       " ('shined', 31),\n",
       " ('weixin', 31),\n",
       " ('hahahaha', 30),\n",
       " ('angelbaby', 30),\n",
       " ('acdc', 29),\n",
       " ('konw', 28),\n",
       " ('wwz', 26),\n",
       " ('javis', 26),\n",
       " ('planb', 26),\n",
       " ('qnmlgb', 26),\n",
       " ('haokan', 26),\n",
       " ('ppps', 26),\n",
       " ('laji', 25),\n",
       " ('mavel', 25),\n",
       " ('zhuangbility', 25),\n",
       " ('bdrip', 24),\n",
       " ('hxm', 24),\n",
       " ('piapiapia', 24),\n",
       " ('wjk', 23),\n",
       " ('shenmegui', 23),\n",
       " ('biubiu', 23),\n",
       " ('sooooo', 23),\n",
       " ('boomboomboom', 22),\n",
       " ('wqnmlgb', 22),\n",
       " ('xddddd', 22),\n",
       " ('paoking', 21),\n",
       " ('xland', 21),\n",
       " ('qmdb', 21),\n",
       " ('tnnd', 21),\n",
       " ('lowb', 20),\n",
       " ('spidey', 20),\n",
       " ('xjb', 20),\n",
       " ('roseonly', 19),\n",
       " ('loveit', 19),\n",
       " ('tieba', 19),\n",
       " ('endding', 19),\n",
       " ('insideout', 18),\n",
       " ('anglababy', 18),\n",
       " ('hhhhhhhhh', 18),\n",
       " ('soooooo', 18),\n",
       " ('wwwwww', 18),\n",
       " ('hanhan', 17),\n",
       " ('kjj', 17),\n",
       " ('polymax', 17),\n",
       " ('dvdscr', 17),\n",
       " ('tbbt', 17),\n",
       " ('bjiff', 17),\n",
       " ('blahblah', 17),\n",
       " ('hitchitsch', 17),\n",
       " ('btih', 17),\n",
       " ('shuhua', 16),\n",
       " ('pyy', 16),\n",
       " ('gaygay', 16),\n",
       " ('tdkr', 16),\n",
       " ('zdy', 15),\n",
       " ('thats', 15),\n",
       " ('sukida', 14),\n",
       " ('lowbee', 14),\n",
       " ('superheros', 14),\n",
       " ('johnnydepp', 14),\n",
       " ('hahahahaha', 14),\n",
       " ('carrots', 14),\n",
       " ('mediocre', 14),\n",
       " ('tryer', 13),\n",
       " ('clawhauser', 13),\n",
       " ('zootropolis', 13),\n",
       " ('rmrb', 13),\n",
       " ('wbq', 13),\n",
       " ('starsare', 13),\n",
       " ('harsher', 13),\n",
       " ('kown', 13),\n",
       " ('sooo', 13),\n",
       " ('sooooooo', 13),\n",
       " ('spidy', 13),\n",
       " ('jlo', 13),\n",
       " ('yyets', 13),\n",
       " ('taotie', 12),\n",
       " ('xinren', 12),\n",
       " ('diaosi', 12),\n",
       " ('yooo', 12),\n",
       " ('iamx', 12),\n",
       " ('awwww', 12),\n",
       " ('nozuonodie', 12),\n",
       " ('pathetic', 12),\n",
       " ('willsmith', 12),\n",
       " ('ttatt', 12),\n",
       " ('cineworld', 12),\n",
       " ('everying', 11),\n",
       " ('tlou', 11),\n",
       " ('zzzq', 11),\n",
       " ('emmastone', 11),\n",
       " ('dreamit', 11),\n",
       " ('exome', 11),\n",
       " ('qaqqq', 11),\n",
       " ('maggieq', 11),\n",
       " ('linkinpark', 10),\n",
       " ('changelababy', 10),\n",
       " ('dreamfoolish', 10),\n",
       " ('shabi', 10),\n",
       " ('gdzj', 10),\n",
       " ('doubanapp', 10),\n",
       " ('capitain', 10),\n",
       " ('dwz', 10),\n",
       " ('prprprpr', 10),\n",
       " ('xjbd', 10),\n",
       " ('blx', 10),\n",
       " ('judie', 10),\n",
       " ('congroo', 10),\n",
       " ('qnmd', 10),\n",
       " ('kevinspacey', 10),\n",
       " ('sjb', 10),\n",
       " ('nmlgb', 10),\n",
       " ('clinteastwood', 10),\n",
       " ('everting', 9),\n",
       " ('zpd', 9),\n",
       " ('takisu', 9),\n",
       " ('illion', 9),\n",
       " ('zym', 9),\n",
       " ('mojin', 9),\n",
       " ('shawarma', 9),\n",
       " ('hlod', 9),\n",
       " ('boger', 9),\n",
       " ('hhhhhhhhhhh', 9),\n",
       " ('sheirly', 9),\n",
       " ('tttt', 9),\n",
       " ('sterotype', 9),\n",
       " ('hmmm', 9),\n",
       " ('xswl', 9),\n",
       " ('zmax', 9),\n",
       " ('hahah', 9),\n",
       " ('bongbongbong', 9),\n",
       " ('qnmb', 9),\n",
       " ('pretentious', 9),\n",
       " ('kongfu', 9),\n",
       " ('cinematheque', 9),\n",
       " ('dbox', 9),\n",
       " ('jjyy', 9),\n",
       " ('cooooool', 9),\n",
       " ('tomhanks', 9),\n",
       " ('gossipgirl', 8),\n",
       " ('achehere', 8),\n",
       " ('seemhere', 8),\n",
       " ('withtorry', 8),\n",
       " ('ctmd', 8),\n",
       " ('anyways', 8),\n",
       " ('wwwwwww', 8),\n",
       " ('zby', 8),\n",
       " ('bibibi', 8),\n",
       " ('biubiubiubiu', 8),\n",
       " ('cinemark', 8),\n",
       " ('hulkbuster', 8),\n",
       " ('lowlow', 8),\n",
       " ('bgcp', 8),\n",
       " ('merde', 8),\n",
       " ('soooooooo', 8),\n",
       " ('embarrassing', 8),\n",
       " ('bility', 8),\n",
       " ('didnt', 8),\n",
       " ('megryan', 8),\n",
       " ('zzzzz', 8),\n",
       " ('awsome', 8),\n",
       " ('cnmb', 8),\n",
       " ('jimcarrey', 8),\n",
       " ('balmes', 8),\n",
       " ('everyting', 7),\n",
       " ('taotei', 7),\n",
       " ('sawadika', 7),\n",
       " ('drifted', 7),\n",
       " ('hhhhhhhhhh', 7),\n",
       " ('puke', 7),\n",
       " ('yooooooo', 7),\n",
       " ('sherley', 7),\n",
       " ('haixing', 7),\n",
       " ('yoooooooo', 7),\n",
       " ('gtmd', 7),\n",
       " ('orzzz', 7),\n",
       " ('jjww', 7),\n",
       " ('marval', 7),\n",
       " ('piupiupiu', 7),\n",
       " ('hanni', 7),\n",
       " ('dxy', 7),\n",
       " ('rylance', 7),\n",
       " ('depressing', 7),\n",
       " ('satc', 7),\n",
       " ('labeouf', 7),\n",
       " ('woodyallen', 7),\n",
       " ('bingbong', 7),\n",
       " ('bluesliver', 7),\n",
       " ('wower', 7),\n",
       " ('cmct', 7),\n",
       " ('ttutt', 7),\n",
       " ('goodending', 7),\n",
       " ('robertdeniro', 7),\n",
       " ('amazingly', 7),\n",
       " ('lmao', 7),\n",
       " ('gscas', 7),\n",
       " ('jasonstatham', 7),\n",
       " ('urself', 7),\n",
       " ('hanmeimei', 7),\n",
       " ('shaki', 6),\n",
       " ('blacksad', 6),\n",
       " ('xneil', 6),\n",
       " ('zyq', 6),\n",
       " ('holk', 6),\n",
       " ('upupup', 6),\n",
       " ('kzd', 6),\n",
       " ('anglelababy', 6),\n",
       " ('facepalm', 6),\n",
       " ('mamamia', 6),\n",
       " ('ryangosling', 6),\n",
       " ('shanghaied', 6),\n",
       " ('xdddddd', 6),\n",
       " ('yinsen', 6),\n",
       " ('xddddddd', 6),\n",
       " ('zxc', 6),\n",
       " ('overated', 6),\n",
       " ('shriley', 6),\n",
       " ('cnmlgb', 6),\n",
       " ('bucuo', 6),\n",
       " ('captin', 6),\n",
       " ('ywx', 6),\n",
       " ('zqsg', 6),\n",
       " ('tryeverything', 6),\n",
       " ('excuseme', 6),\n",
       " ('bobototo', 6),\n",
       " ('hohoho', 6),\n",
       " ('pointless', 6),\n",
       " ('zyz', 6),\n",
       " ('hmmmm', 6),\n",
       " ('bulabula', 6),\n",
       " ('undatable', 6),\n",
       " ('undateable', 6),\n",
       " ('kickass', 6),\n",
       " ('messed', 6),\n",
       " ('shareid', 6),\n",
       " ('xzd', 6),\n",
       " ('hehehe', 6),\n",
       " ('kyxq', 6),\n",
       " ('amiable', 6),\n",
       " ('doesnt', 6),\n",
       " ('clotaire', 6),\n",
       " ('angelinajolie', 6),\n",
       " ('anyhow', 6),\n",
       " ('hughjackman', 6),\n",
       " ('xiami', 6),\n",
       " ('blablablabla', 6),\n",
       " ('timburton', 6),\n",
       " ('wwwwwwwww', 6),\n",
       " ('mlgbd', 6),\n",
       " ('nope', 6),\n",
       " ('prevarticle', 6),\n",
       " ('wwwwwwww', 6),\n",
       " ('nicolascage', 6),\n",
       " ('sociopath', 6),\n",
       " ('vikander', 6),\n",
       " ('zootpia', 5),\n",
       " ('zoogle', 5),\n",
       " ('ipaw', 5),\n",
       " ('zoomania', 5),\n",
       " ('eveything', 5),\n",
       " ('icarrot', 5),\n",
       " ('sukita', 5),\n",
       " ('zoopotia', 5),\n",
       " ('kikikukiki', 5),\n",
       " ('walkingdead', 5),\n",
       " ('diorissimo', 5),\n",
       " ('babay', 5),\n",
       " ('nankingcinema', 5),\n",
       " ('scarlette', 5),\n",
       " ('scherbatsky', 5),\n",
       " ('zhengzhi', 5),\n",
       " ('sosososo', 5),\n",
       " ('fufufu', 5),\n",
       " ('beginagain', 5),\n",
       " ('begain', 5),\n",
       " ('starsyou', 5),\n",
       " ('cityofstars', 5),\n",
       " ('reeling', 5),\n",
       " ('hhhhhhhhhhhhh', 5),\n",
       " ('withu', 5),\n",
       " ('ladygaga', 5),\n",
       " ('cooooooool', 5),\n",
       " ('impotent', 5),\n",
       " ('monkeyking', 5),\n",
       " ('ahhhh', 5),\n",
       " ('xgg', 5),\n",
       " ('jxy', 5),\n",
       " ('okok', 5),\n",
       " ('shiely', 5),\n",
       " ('shrily', 5),\n",
       " ('panle', 5),\n",
       " ('mywiz', 5),\n",
       " ('ttttt', 5),\n",
       " ('ppns', 5),\n",
       " ('ttarticle', 5),\n",
       " ('ummm', 5),\n",
       " ('zjj', 5),\n",
       " ('oyz', 5),\n",
       " ('prprprprpr', 5),\n",
       " ('relatable', 5),\n",
       " ('tedious', 5),\n",
       " ('qaqqqq', 5),\n",
       " ('xjbp', 5),\n",
       " ('panzerlied', 5),\n",
       " ('agyness', 5),\n",
       " ('jwj', 5),\n",
       " ('piaohua', 5),\n",
       " ('emmm', 5),\n",
       " ('cliches', 5),\n",
       " ('watchable', 5),\n",
       " ('almodovar', 5),\n",
       " ('hahahah', 5),\n",
       " ('tougher', 5),\n",
       " ('sonakshi', 5),\n",
       " ('bitterness', 5),\n",
       " ('entertained', 5),\n",
       " ('bingbang', 5),\n",
       " ('lindsaylohan', 5),\n",
       " ('agbb', 5),\n",
       " ('disappoint', 5),\n",
       " ('coens', 5),\n",
       " ('sucked', 5),\n",
       " ('obba', 5),\n",
       " ('juliaroberts', 5),\n",
       " ('funnier', 5),\n",
       " ('awww', 5),\n",
       " ('deniro', 5),\n",
       " ('halfcd', 5),\n",
       " ('bfsu', 5),\n",
       " ('moonriver', 5),\n",
       " ('wlgc', 5),\n",
       " ('niubility', 5),\n",
       " ('sharlto', 5),\n",
       " ('albumplay', 5),\n",
       " ('balabalabala', 5),\n",
       " ('gwy', 5),\n",
       " ('bitchy', 5),\n",
       " ('yooooo', 5),\n",
       " ('hdwing', 5),\n",
       " ('annehathaway', 5),\n",
       " ('wakanda', 5),\n",
       " ('mrbig', 4),\n",
       " ('wakawaka', 4),\n",
       " ('zoopia', 4),\n",
       " ('awwwww', 4),\n",
       " ('racialism', 4),\n",
       " ('letitgo', 4),\n",
       " ('redwimps', 4),\n",
       " ('miziha', 4),\n",
       " ('lff', 4),\n",
       " ('tboy', 4),\n",
       " ('pppps', 4),\n",
       " ('jcyt', 4),\n",
       " ('sherily', 4),\n",
       " ('sherliy', 4),\n",
       " ('lxq', 4),\n",
       " ('byw', 4),\n",
       " ('viavia', 4),\n",
       " ('crosspolo', 4),\n",
       " ('newpolo', 4),\n",
       " ('ohohoh', 4),\n",
       " ('hork', 4),\n",
       " ('outman', 4),\n",
       " ('xqq', 4),\n",
       " ('classmates', 4),\n",
       " ('agelababy', 4),\n",
       " ('withx', 4),\n",
       " ('tooooo', 4),\n",
       " ('qzone', 4),\n",
       " ('nmit', 4),\n",
       " ('sebstian', 4),\n",
       " ('aways', 4),\n",
       " ('theeafter', 4),\n",
       " ('mecity', 4),\n",
       " ('aches', 4),\n",
       " ('pursing', 4),\n",
       " ('hzw', 4),\n",
       " ('hoid', 4),\n",
       " ('hahahahahaha', 4),\n",
       " ('helluva', 4),\n",
       " ('lron', 4),\n",
       " ('cooool', 4),\n",
       " ('rtj', 4),\n",
       " ('kekeke', 4),\n",
       " ('kkkkk', 4),\n",
       " ('tmsb', 4),\n",
       " ('trailor', 4),\n",
       " ('guangdian', 4),\n",
       " ('supermen', 4),\n",
       " ('ipam', 4),\n",
       " ('shierly', 4),\n",
       " ('ojiji', 4),\n",
       " ('cxy', 4),\n",
       " ('biatch', 4),\n",
       " ('muamuamua', 4),\n",
       " ('manman', 4),\n",
       " ('soooooooooo', 4),\n",
       " ('hign', 4),\n",
       " ('hmmmmm', 4),\n",
       " ('sebby', 4),\n",
       " ('kumamon', 4),\n",
       " ('teamcaptain', 4),\n",
       " ('wyt', 4),\n",
       " ('ttttttt', 4),\n",
       " ('svb', 4),\n",
       " ('provoking', 4),\n",
       " ('diaos', 4),\n",
       " ('isappinstalled', 4),\n",
       " ('lexburner', 4),\n",
       " ('doubanio', 4),\n",
       " ('lxy', 4),\n",
       " ('qaaaq', 4),\n",
       " ('tacky', 4),\n",
       " ('ohyeah', 4),\n",
       " ('villian', 4),\n",
       " ('wmh', 4),\n",
       " ('underwhelming', 4),\n",
       " ('duangduangduangduangduang', 4),\n",
       " ('justsoso', 4),\n",
       " ('coooooool', 4),\n",
       " ('cucurrucucu', 4),\n",
       " ('fvck', 4),\n",
       " ('scumbag', 4),\n",
       " ('zenzen', 4),\n",
       " ('makus', 4),\n",
       " ('taotao', 4),\n",
       " ('yiban', 4),\n",
       " ('dady', 4),\n",
       " ('effortless', 4),\n",
       " ('sooooooooooo', 4),\n",
       " ('banal', 4),\n",
       " ('kevincostner', 4),\n",
       " ('idk', 4),\n",
       " ('toooooo', 4),\n",
       " ('pissed', 4),\n",
       " ('garrn', 4),\n",
       " ('paulnewman', 4),\n",
       " ('isrenhe', 4),\n",
       " ('lanpian', 4),\n",
       " ('cinemaxx', 4),\n",
       " ('danieldaylewis', 4),\n",
       " ('aibileen', 4),\n",
       " ('funy', 4),\n",
       " ('biangbiangbiang', 4),\n",
       " ('bugger', 4),\n",
       " ('xunlei', 4),\n",
       " ('eits', 4),\n",
       " ('cnxp', 4),\n",
       " ('unfulfilled', 4),\n",
       " ('hakunamatata', 4),\n",
       " ('tspdt', 4),\n",
       " ('claflin', 4),\n",
       " ('reservoirbuns', 4),\n",
       " ('angryalien', 4),\n",
       " ('dytt', 4),\n",
       " ('harrisonford', 4),\n",
       " ('evagreen', 4),\n",
       " ('robertdowneyjr', 4),\n",
       " ('bradpitt', 4),\n",
       " ('alpacino', 4),\n",
       " ('sooooooooo', 4),\n",
       " ('imbt', 4),\n",
       " ('weallmc', 4),\n",
       " ('unrealistic', 4),\n",
       " ('qqaqq', 4),\n",
       " ('tttattt', 4),\n",
       " ('ahaha', 4),\n",
       " ('doulist', 4),\n",
       " ('yunpan', 4),\n",
       " ('sowhat', 4),\n",
       " ('tomcruise', 4),\n",
       " ('pround', 4),\n",
       " ('balalala', 4),\n",
       " ('distracting', 4),\n",
       " ('caonimagewangbagaozi', 4),\n",
       " ('vindiesel', 4),\n",
       " ('bamf', 4),\n",
       " ('debuff', 4),\n",
       " ('youself', 4),\n",
       " ('hahha', 4),\n",
       " ('familly', 4),\n",
       " ('nill', 4),\n",
       " ('dreamwork', 4),\n",
       " ('uptobox', 4),\n",
       " ('stevenspielberg', 4),\n",
       " ('bingka', 4),\n",
       " ('coooool', 4),\n",
       " ('happyness', 4),\n",
       " ('mejudy', 3),\n",
       " ('zootapia', 3),\n",
       " ('targoat', 3),\n",
       " ('zootupia', 3),\n",
       " ('muei', 3),\n",
       " ('awwwwww', 3),\n",
       " ('zotopia', 3),\n",
       " ('mynameis', 3),\n",
       " ('radwipms', 3),\n",
       " ('itomori', 3),\n",
       " ('radwinps', 3),\n",
       " ('namae', 3),\n",
       " ('kataware', 3),\n",
       " ('chimerica', 3),\n",
       " ('biabia', 3),\n",
       " ('honer', 3),\n",
       " ('fxck', 3),\n",
       " ('remakes', 3),\n",
       " ('sequels', 3),\n",
       " ('naocan', 3),\n",
       " ('hehehehe', 3),\n",
       " ('hodl', 3),\n",
       " ('wlgq', 3),\n",
       " ('kaikaiko', 3),\n",
       " ('ketty', 3),\n",
       " ('pasotti', 3),\n",
       " ('supuer', 3),\n",
       " ('taofen', 3),\n",
       " ('yaoyao', 3),\n",
       " ('zergling', 3),\n",
       " ('lamer', 3),\n",
       " ('jingtian', 3),\n",
       " ('xjbg', 3),\n",
       " ('wenge', 3),\n",
       " ('sexist', 3),\n",
       " ('shiry', 3),\n",
       " ('willbe', 3),\n",
       " ('henhao', 3),\n",
       " ('haihao', 3),\n",
       " ('ahahah', 3),\n",
       " ('trible', 3),\n",
       " ('muamua', 3),\n",
       " ('scalett', 3),\n",
       " ('woidm', 3),\n",
       " ('coulsen', 3),\n",
       " ('chitauri', 3),\n",
       " ('semigod', 3),\n",
       " ('solemate', 3),\n",
       " ('jyy', 3),\n",
       " ('chickflick', 3),\n",
       " ('tttttttt', 3),\n",
       " ('duwai', 3),\n",
       " ('angleababy', 3),\n",
       " ('sooooooooooooo', 3),\n",
       " ('wuliao', 3),\n",
       " ('hehehehehe', 3),\n",
       " ('hohohoho', 3),\n",
       " ('hungover', 3),\n",
       " ('chrismas', 3),\n",
       " ('xuzheng', 3),\n",
       " ('fanbb', 3),\n",
       " ('worths', 3),\n",
       " ('goodend', 3),\n",
       " ('sneezing', 3),\n",
       " ('agian', 3),\n",
       " ('whatif', 3),\n",
       " ('yearhow', 3),\n",
       " ('idealistic', 3),\n",
       " ('heres', 3),\n",
       " ('pyq', 3),\n",
       " ('breakhere', 3),\n",
       " ('tbh', 3),\n",
       " ('sytycd', 3),\n",
       " ('clinche', 3),\n",
       " ('yys', 3),\n",
       " ('notbad', 3),\n",
       " ('smirk', 3),\n",
       " ('zhizhu', 3),\n",
       " ('xxxxxxxxxx', 3),\n",
       " ('sososo', 3),\n",
       " ('steampunk', 3),\n",
       " ('whh', 3),\n",
       " ('shuqi', 3),\n",
       " ('xxd', 3),\n",
       " ('lsq', 3),\n",
       " ('tttttttttttttt', 3),\n",
       " ('qwqqqq', 3),\n",
       " ('iorn', 3),\n",
       " ('transfomers', 3),\n",
       " ('yinson', 3),\n",
       " ('coool', 3),\n",
       " ('donney', 3),\n",
       " ('fushun', 3),\n",
       " ('begining', 3),\n",
       " ('repost', 3),\n",
       " ('zkw', 3),\n",
       " ('madao', 3),\n",
       " ('fack', 3),\n",
       " ('nankingpc', 3),\n",
       " ('zxm', 3),\n",
       " ('aaaaaaa', 3),\n",
       " ('zzhk', 3),\n",
       " ('engding', 3),\n",
       " ('yoooooo', 3),\n",
       " ('qnmlgbd', 3),\n",
       " ('jjf', 3),\n",
       " ('zyj', 3),\n",
       " ('xihuan', 3),\n",
       " ('eww', 3),\n",
       " ('jjw', 3),\n",
       " ('idealists', 3),\n",
       " ('cnmlb', 3),\n",
       " ('iwatch', 3),\n",
       " ('sheily', 3),\n",
       " ('cnmd', 3),\n",
       " ('unbelievably', 3),\n",
       " ('shirey', 3),\n",
       " ('gyy', 3),\n",
       " ('damons', 3),\n",
       " ('coppice', 3),\n",
       " ('wqnmlgbd', 3),\n",
       " ('moneymoneymoney', 3),\n",
       " ('profite', 3),\n",
       " ('sumsung', 3),\n",
       " ('kehuanpian', 3),\n",
       " ('arclight', 3),\n",
       " ('bbbbb', 3),\n",
       " ('haibucuo', 3),\n",
       " ('qnm', 3),\n",
       " ('thanking', 3),\n",
       " ('punchy', 3),\n",
       " ('teamtony', 3),\n",
       " ('teamiron', 3),\n",
       " ('tiantian', 3),\n",
       " ('wjm', 3),\n",
       " ('congratulation', 3),\n",
       " ('friendso', 3),\n",
       " ('tttttt', 3),\n",
       " ('nbcs', 3),\n",
       " ('lsy', 3),\n",
       " ('cpcp', 3),\n",
       " ('lalalalalala', 3),\n",
       " ('zhf', 3),\n",
       " ('teambucky', 3),\n",
       " ('fyc', 3),\n",
       " ('cliched', 3),\n",
       " ('holyshit', 3),\n",
       " ('qwqqqqq', 3),\n",
       " ('shiwang', 3),\n",
       " ('spherex', 3),\n",
       " ('balaba', 3),\n",
       " ('avangers', 3),\n",
       " ('stanlee', 3),\n",
       " ('hhha', 3),\n",
       " ('dududu', 3),\n",
       " ('balabla', 3),\n",
       " ('withhh', 3),\n",
       " ('lwj', 3),\n",
       " ('lilleugc', 3),\n",
       " ('boooooooom', 3),\n",
       " ('jcube', 3),\n",
       " ('unbearably', 3),\n",
       " ('tshirt', 3),\n",
       " ('ordin', 3),\n",
       " ('haiya', 3),\n",
       " ('abcdefg', 3),\n",
       " ('sokovia', 3),\n",
       " ('stroszek', 3),\n",
       " ('sver', 3),\n",
       " ('ohmygod', 3),\n",
       " ('bttiantang', 3),\n",
       " ('parda', 3),\n",
       " ('sifi', 3),\n",
       " ('xinxin', 3),\n",
       " ('milland', 3),\n",
       " ('perverted', 3),\n",
       " ('impeccable', 3),\n",
       " ('queenb', 3),\n",
       " ('hdchina', 3),\n",
       " ('pauldano', 3),\n",
       " ('baobao', 3),\n",
       " ('hqc', 3),\n",
       " ('shelookslikeayoungmonicavitti', 3),\n",
       " ('shabby', 3),\n",
       " ('yoooooooooo', 3),\n",
       " ('zhidao', 3),\n",
       " ('documentaire', 3),\n",
       " ('couscous', 3),\n",
       " ('omfg', 3),\n",
       " ('balabalabalabala', 3),\n",
       " ('meilleure', 3),\n",
       " ('jansport', 3),\n",
       " ('naivety', 3),\n",
       " ('grym', 3),\n",
       " ('gainsbourgcharlotte', 3),\n",
       " ('johncusack', 3),\n",
       " ('ctrlhd', 3),\n",
       " ('dualaudio', 3),\n",
       " ('fasten', 3),\n",
       " ('fking', 3),\n",
       " ('bbbbbb', 3),\n",
       " ('excause', 3),\n",
       " ('goushi', 3),\n",
       " ('bdiso', 3),\n",
       " ('befor', 3),\n",
       " ('admirably', 3),\n",
       " ('dardennes', 3),\n",
       " ('platitude', 3),\n",
       " ('taphore', 3),\n",
       " ('yoyoyo', 3),\n",
       " ('merylstreep', 3),\n",
       " ('shithole', 3),\n",
       " ('filmoteca', 3),\n",
       " ('huges', 3),\n",
       " ('chbosky', 3),\n",
       " ('psychos', 3),\n",
       " ('bullied', 3),\n",
       " ('wonderfully', 3),\n",
       " ('subtlety', 3),\n",
       " ('underestimate', 3),\n",
       " ('hughgrant', 3),\n",
       " ('poping', 3),\n",
       " ('biger', 3),\n",
       " ('mickeyrourke', 3),\n",
       " ('soulmates', 3),\n",
       " ('zyc', 3),\n",
       " ('sonoya', 3),\n",
       " ('kikikaka', 3),\n",
       " ('zzzzzzzzz', 3),\n",
       " ('magnifique', 3),\n",
       " ('yculblog', 3),\n",
       " ('denzelwashington', 3),\n",
       " ('konwn', 3),\n",
       " ('eastgame', 3),\n",
       " ('perfact', 3),\n",
       " ('guessed', 3),\n",
       " ('somethings', 3),\n",
       " ('johntravolta', 3),\n",
       " ('edwardnorton', 3),\n",
       " ('sayin', 3),\n",
       " ('trashy', 3),\n",
       " ('hahahhhh', 3),\n",
       " ('characterization', 3),\n",
       " ('dadadada', 3),\n",
       " ('bazinga', 3),\n",
       " ('oumei', 3),\n",
       " ('hdrip', 3),\n",
       " ('hollyfood', 3),\n",
       " ('hhhhhhhhhhhhhhh', 3),\n",
       " ('weibotime', 3),\n",
       " ('javu', 3),\n",
       " ('seizes', 3),\n",
       " ('dustinhoffman', 3),\n",
       " ('symbolis', 3),\n",
       " ('georgeclooney', 3),\n",
       " ('fransokyo', 3),\n",
       " ('qqqaqqq', 3),\n",
       " ('luxiang', 3),\n",
       " ('crappy', 3),\n",
       " ('simonpegg', 3),\n",
       " ('framestore', 3),\n",
       " ('rarbg', 3),\n",
       " ('hohohohoho', 3),\n",
       " ('goodluck', 3),\n",
       " ('judelaw', 3),\n",
       " ('hassells', 3),\n",
       " ('gijoe', 3),\n",
       " ('hkaff', 3),\n",
       " ('dennings', 3),\n",
       " ('panyu', 3),\n",
       " ('seehd', 3),\n",
       " ('saysaysay', 3),\n",
       " ('sarabandi', 3),\n",
       " ('fereshte', 3),\n",
       " ('seddiqi', 3),\n",
       " ('bahare', 3),\n",
       " ('farrokh', 3),\n",
       " ('katewinslet', 3),\n",
       " ('mcmurphy', 3),\n",
       " ('shxt', 3),\n",
       " ('empathetic', 3),\n",
       " ('jasonbourne', 3),\n",
       " ('buleliongb', 3),\n",
       " ('ifree', 3),\n",
       " ('dofp', 3),\n",
       " ('zooutopia', 2),\n",
       " ('whatcanidoforutoday', 2),\n",
       " ('disgustingly', 2),\n",
       " ('groundi', 2),\n",
       " ('zootopic', 2),\n",
       " ('derder', 2),\n",
       " ('iwouldn', 2),\n",
       " ('funko', 2),\n",
       " ('awwwwwwww', 2),\n",
       " ('mousey', 2),\n",
       " ('weselton', 2),\n",
       " ('buuny', 2),\n",
       " ('pofei', 2),\n",
       " ('lisben', 2),\n",
       " ('wdas', 2),\n",
       " ('multiplying', 2),\n",
       " ('rofl', 2),\n",
       " ('fearmongering', 2),\n",
       " ('keai', 2),\n",
       " ('cuteness', 2),\n",
       " ('sterotypes', 2),\n",
       " ('woolter', 2),\n",
       " ('netflex', 2),\n",
       " ('pawpsicles', 2),\n",
       " ('tgify', 2),\n",
       " ('igroom', 2),\n",
       " ('flashflashflash', 2),\n",
       " ('zzr', 2),\n",
       " ('howlers', 2),\n",
       " ('mistakestry', 2),\n",
       " ('untrustworthy', 2),\n",
       " ('ztp', 2),\n",
       " ('babyblue', 2),\n",
       " ('shikira', 2),\n",
       " ('mitshuha', 2),\n",
       " ('ipone', 2),\n",
       " ('faintly', 2),\n",
       " ('musibi', 2),\n",
       " ('niceboat', 2),\n",
       " ('radwimp', 2),\n",
       " ('takikun', 2),\n",
       " ('yoyoki', 2),\n",
       " ('mithuha', 2),\n",
       " ('xradwimps', 2),\n",
       " ('dalao', 2),\n",
       " ('resisit', 2),\n",
       " ('sikida', 2),\n",
       " ('bodyswapping', 2),\n",
       " ('namaii', 2),\n",
       " ('knotting', 2),\n",
       " ('namayiwa', 2),\n",
       " ('kimino', 2),\n",
       " ('kagaya', 2),\n",
       " ('zbh', 2),\n",
       " ('buywater', 2),\n",
       " ('dush', 2),\n",
       " ('leadship', 2),\n",
       " ('hdts', 2),\n",
       " ('sensai', 2),\n",
       " ('megen', 2),\n",
       " ('sgoyi', 2),\n",
       " ('honghonghong', 2),\n",
       " ('wokao', 2),\n",
       " ('yxf', 2),\n",
       " ('gmax', 2),\n",
       " ('tiring', 2),\n",
       " ('kakakukuqiqi', 2),\n",
       " ('biubiubiubiubiu', 2),\n",
       " ('bingbingli', 2),\n",
       " ('haapy', 2),\n",
       " ('qaqqqqq', 2),\n",
       " ('wuwuwuwu', 2),\n",
       " ('runing', 2),\n",
       " ('wym', 2),\n",
       " ('kajima', 2),\n",
       " ('wondergirl', 2),\n",
       " ('balibali', 2),\n",
       " ('sophienstrasse', 2),\n",
       " ('holdor', 2),\n",
       " ('undeniably', 2),\n",
       " ('webhd', 2),\n",
       " ('alohaoe', 2),\n",
       " ('zpyl', 2),\n",
       " ('tsks', 2),\n",
       " ('cbdevent', 2),\n",
       " ('warz', 2),\n",
       " ('soop', 2),\n",
       " ('bulingbuling', 2),\n",
       " ('apian', 2),\n",
       " ('iloveu', 2),\n",
       " ('fashionshow', 2),\n",
       " ('bdhd', 2),\n",
       " ('buling', 2),\n",
       " ('yymv', 2),\n",
       " ('guojingming', 2),\n",
       " ('linxiao', 2),\n",
       " ('ppmm', 2),\n",
       " ('judgmental', 2),\n",
       " ('campbel', 2),\n",
       " ('hsy', 2),\n",
       " ('motherfuck', 2),\n",
       " ('pky', 2),\n",
       " ('withl', 2),\n",
       " ('wodema', 2),\n",
       " ('nyhd', 2),\n",
       " ('yier', 2),\n",
       " ('whocare', 2),\n",
       " ('minzhu', 2),\n",
       " ('greatwall', 2),\n",
       " ('intelligents', 2),\n",
       " ('peoplemoveon', 2),\n",
       " ('xfgy', 2),\n",
       " ('chiglish', 2),\n",
       " ('jwyz', 2),\n",
       " ('hordor', 2),\n",
       " ('sinos', 2),\n",
       " ('oberyn', 2),\n",
       " ('prejudices', 2),\n",
       " ('liangjun', 2),\n",
       " ('goagentg', 2),\n",
       " ('whitewalk', 2),\n",
       " ('whitewashing', 2),\n",
       " ('tfboi', 2),\n",
       " ('piupiu', 2),\n",
       " ('zuozuo', 2),\n",
       " ('xiger', 2),\n",
       " ('oaq', 2),\n",
       " ('yjx', 2),\n",
       " ('changela', 2),\n",
       " ('ballball', 2),\n",
       " ('zhuangb', 2),\n",
       " ('angla', 2),\n",
       " ('angerbaby', 2),\n",
       " ('killbill', 2),\n",
       " ('daomu', 2),\n",
       " ('dmbj', 2),\n",
       " ('naturalbaby', 2),\n",
       " ('shitzu', 2),\n",
       " ('yhao', 2),\n",
       " ('overacting', 2),\n",
       " ('diaobao', 2),\n",
       " ('eeeee', 2),\n",
       " ('xzapk', 2),\n",
       " ('wfr', 2),\n",
       " ('xyp', 2),\n",
       " ('twocold', 2),\n",
       " ('quesera', 2),\n",
       " ('naocanfen', 2),\n",
       " ('lumiai', 2),\n",
       " ('yunpeng', 2),\n",
       " ('duanzi', 2),\n",
       " ('yimo', 2),\n",
       " ('hzx', 2),\n",
       " ('awkard', 2),\n",
       " ('tellible', 2),\n",
       " ('psss', 2),\n",
       " ('preachy', 2),\n",
       " ('veyia', 2),\n",
       " ('tinytimes', 2),\n",
       " ('pushu', 2),\n",
       " ('xjy', 2),\n",
       " ('inspring', 2),\n",
       " ('qtmlgb', 2),\n",
       " ('nickfury', 2),\n",
       " ('motherfuckers', 2),\n",
       " ('bongbongbongbong', 2),\n",
       " ('lokiiiiiiiiiiii', 2),\n",
       " ('niubi', 2),\n",
       " ('pefect', 2),\n",
       " ('debide', 2),\n",
       " ('lwx', 2),\n",
       " ('cocopark', 2),\n",
       " ('xxxd', 2),\n",
       " ('sooooooooooooooooo', 2),\n",
       " ('lokiloki', 2),\n",
       " ('loky', 2),\n",
       " ('guokr', 2),\n",
       " ('optimusprime', 2),\n",
       " ('fasinating', 2),\n",
       " ...]"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unknown_string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:35:10.682083Z",
     "start_time": "2020-05-14T02:35:10.675564Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "wyf: ['wy', 'f']\n",
      "qwq: ['qw', 'q']\n",
      "rdj: ['rd', 'j']\n",
      "tfboy: ['tfb', 'oy']\n",
      "shirly: ['shirl', 'y']\n",
      "hhhhh: ['hhhh', 'h']\n",
      "mdzz: ['mdz', 'z']\n",
      "exm: ['ex', 'm']\n",
      "xddd: ['xdd', 'd']\n",
      "happyending: ['happyend', 'ing']\n"
     ]
    }
   ],
   "source": [
    "# Preview how `cut` segments the first ten entries of unknown_string.\n",
    "# Each entry is unpacked as (token, second field); only the token is used.\n",
    "for w, _ in unknown_string[:10]:\n",
    "    print(w, cut(w), sep=': ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:36:13.158014Z",
     "start_time": "2020-05-14T02:36:13.142097Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('eascii', 0.8281255960464478),\n",
       " ('xfc', 0.8146369457244873),\n",
       " ('dfff', 0.8047963976860046),\n",
       " ('inet', 0.7976295351982117),\n",
       " ('fullwidth', 0.7961112260818481),\n",
       " ('tab', 0.7959989309310913),\n",
       " ('tty', 0.7914590239524841),\n",
       " ('xde', 0.7877368927001953),\n",
       " ('adodb', 0.7856166362762451),\n",
       " ('afff', 0.783061146736145)]"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE: the handling of English text when training the word vectors\n",
    "# still needs improvement.\n",
    "wv.most_similar(positive=['xdd'], topn=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 其它"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:37:37.503304Z",
     "start_time": "2020-05-14T02:37:37.414400Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('20161220', 2),\n",
       " ('5727', 2),\n",
       " ('dd51', 1),\n",
       " ('🈳', 2),\n",
       " ('pes5', 1),\n",
       " ('20172', 2),\n",
       " ('160403', 1),\n",
       " ('20160510', 6),\n",
       " ('3y', 1),\n",
       " ('18709112708', 1),\n",
       " ('8214996', 1),\n",
       " ('064', 1),\n",
       " ('1943', 6),\n",
       " ('diablo3', 1),\n",
       " ('119', 8),\n",
       " ('7years', 1),\n",
       " ('20150225', 2),\n",
       " ('┃', 11),\n",
       " ('221', 6),\n",
       " ('20131024', 1),\n",
       " ('20160916', 12),\n",
       " ('ccc1988', 1),\n",
       " ('7s', 2),\n",
       " ('20140914', 1),\n",
       " ('130627', 3),\n",
       " ('qq1632635916', 1),\n",
       " ('ᙏ', 2),\n",
       " ('151014', 1),\n",
       " ('20074', 1),\n",
       " ('av2125855', 1),\n",
       " ('1398592', 1),\n",
       " ('😧', 24),\n",
       " ('🐍', 5),\n",
       " ('🇮', 4),\n",
       " ('⚘', 1),\n",
       " ('58539184', 1),\n",
       " ('rtdv6ix', 1),\n",
       " ('20130221', 1),\n",
       " ('dphgoxehtw4', 2),\n",
       " ('31', 310),\n",
       " ('\\ue43c', 1),\n",
       " ('\\ue411', 6),\n",
       " ('53367', 1),\n",
       " ('5ed2277a0d9b1e63d7a89cb8', 1),\n",
       " ('s4c', 1),\n",
       " ('551', 1),\n",
       " ('140720', 2),\n",
       " ('╲', 2),\n",
       " ('baby10', 1),\n",
       " ('771', 1),\n",
       " ('2147483647', 1),\n",
       " ('👦', 12),\n",
       " ('7pm', 4),\n",
       " ('40km', 1),\n",
       " ('nc16', 2),\n",
       " ('7517693', 1),\n",
       " ('20110806', 1),\n",
       " ('ifree3d', 1),\n",
       " ('qq909071999', 1),\n",
       " ('stars2333', 1),\n",
       " ('20140723', 2),\n",
       " ('tt0117589', 1),\n",
       " ('5000w', 2),\n",
       " ('tmd3d', 1),\n",
       " ('།', 16),\n",
       " ('≖', 40),\n",
       " ('5lbec5sqd18n', 1),\n",
       " ('tt0111512', 1),\n",
       " ('33', 285),\n",
       " ('╘', 2),\n",
       " ('51lefan', 1),\n",
       " ('t1a0065802', 1),\n",
       " ('a7', 1),\n",
       " ('4054062107936865', 1),\n",
       " ('91', 50),\n",
       " ('s65', 1),\n",
       " ('20150517', 17),\n",
       " ('222222222222222222222222222222', 1),\n",
       " ('20130327', 4),\n",
       " ('diana917bob', 1),\n",
       " ('ca1', 1),\n",
       " ('ᐖ', 2),\n",
       " ('523452693ed265f9de29558b4cfc1960c895b24a', 1),\n",
       " ('0116', 1),\n",
       " ('ͦ', 4),\n",
       " ('320t', 1),\n",
       " ('30you', 1),\n",
       " ('111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111',\n",
       "  1),\n",
       " ('4276', 1),\n",
       " ('1q84', 39),\n",
       " ('6136569', 1),\n",
       " ('😫', 76),\n",
       " ('160719', 2),\n",
       " ('ed2k', 1),\n",
       " ('20130102', 3),\n",
       " ('3v', 1),\n",
       " ('avengers2', 9),\n",
       " ('9856', 1),\n",
       " ('3033789831', 1),\n",
       " ('98', 175),\n",
       " ('20160507ko', 1),\n",
       " ('tv12', 1),\n",
       " ('20151018', 5),\n",
       " ('20130219', 2),\n",
       " ('pg9', 1),\n",
       " ('160616', 1),\n",
       " ('cl7p2u5h', 1),\n",
       " ('🏠', 1),\n",
       " ('0510', 2),\n",
       " ('583', 1),\n",
       " ('bw55', 1),\n",
       " ('20071108', 1),\n",
       " ('48000', 1),\n",
       " ('320', 4),\n",
       " ('1954', 5),\n",
       " ('zj022w6', 1),\n",
       " ('270', 22),\n",
       " ('fast6', 1),\n",
       " ('n3ds', 1),\n",
       " ('u2', 22),\n",
       " ('2336', 2),\n",
       " ('20110625', 1),\n",
       " ('20070813', 2),\n",
       " ('20080430', 1),\n",
       " ('2612', 1),\n",
       " ('1962', 15),\n",
       " ('🎇', 3),\n",
       " ('11at', 1),\n",
       " ('586', 3),\n",
       " ('\\ue32f', 1),\n",
       " ('s9q7zllpbnha7wo8bkklcw', 1),\n",
       " ('13jan', 1),\n",
       " ('7645438', 1),\n",
       " ('0509', 1),\n",
       " ('5d2', 4),\n",
       " ('rqdz9hw', 1),\n",
       " ('2838528435', 1),\n",
       " ('space4', 1),\n",
       " ('tiffanyyy21', 1),\n",
       " ('307', 3),\n",
       " ('15m', 1),\n",
       " ('35mm', 11),\n",
       " ('2413', 2),\n",
       " ('41', 45),\n",
       " ('2015tghff', 1),\n",
       " ('1024', 16),\n",
       " ('c7818600', 1),\n",
       " ('c3po', 1),\n",
       " ('20151228', 6),\n",
       " ('¥', 64),\n",
       " ('1300', 20),\n",
       " ('१', 8),\n",
       " ('0719', 2),\n",
       " ('017', 1),\n",
       " ('4dx', 146),\n",
       " ('🚓', 2),\n",
       " ('20081120', 1),\n",
       " ('20110224', 1),\n",
       " ('\\ue311', 1),\n",
       " ('855536', 1),\n",
       " ('448', 1),\n",
       " ('2016057', 1),\n",
       " ('📴', 1),\n",
       " ('2242351597', 1),\n",
       " ('20160930', 7),\n",
       " ('2016313', 1),\n",
       " ('1h40', 1),\n",
       " ('1599', 18),\n",
       " ('858', 1),\n",
       " ('20160508', 17),\n",
       " ('201', 5),\n",
       " ('jp3', 1),\n",
       " ('19siff', 1),\n",
       " ('🍒', 4),\n",
       " ('duedate70', 1),\n",
       " ('r20', 1),\n",
       " ('ep4', 2),\n",
       " ('0622', 1),\n",
       " ('20150428', 1),\n",
       " ('3ab', 1),\n",
       " ('5230', 3),\n",
       " ('m063', 1),\n",
       " ('q1', 1),\n",
       " ('100000000', 6),\n",
       " ('610', 1),\n",
       " ('364', 7),\n",
       " ('🎎', 1),\n",
       " ('ᵔ', 3),\n",
       " ('20121231', 4),\n",
       " ('🀄', 3),\n",
       " ('tri3', 1),\n",
       " ('s1e1', 1),\n",
       " ('021', 2),\n",
       " ('\\uf623', 1),\n",
       " ('ั', 225),\n",
       " ('2904623', 1),\n",
       " ('20151206', 1),\n",
       " ('60rmb', 2),\n",
       " ('﹉', 26),\n",
       " ('sa46', 1),\n",
       " ('20151004', 7),\n",
       " ('120101', 1),\n",
       " ('141130', 1),\n",
       " ('50s', 3),\n",
       " ('20150914', 2),\n",
       " ('qtkna80mjxtkgg', 1),\n",
       " ('🤷', 100),\n",
       " ('mrubdhw3naq', 1),\n",
       " ('61th', 1),\n",
       " ('20e', 4),\n",
       " ('xotgxmze0njq', 1),\n",
       " ('20161018', 2),\n",
       " ('69', 49),\n",
       " ('̆', 48),\n",
       " ('3920', 2),\n",
       " ('🇺', 31),\n",
       " ('20141123', 2),\n",
       " ('֒', 10),\n",
       " ('yeezy2', 2),\n",
       " ('800087322', 1),\n",
       " ('3welcome', 1),\n",
       " ('ಢ', 2),\n",
       " ('2629201', 1),\n",
       " ('20161916', 1),\n",
       " ('ऀ', 2),\n",
       " ('48m', 1),\n",
       " ('20160324', 3),\n",
       " ('8ac77682gw1f4twyuqiygj20eh0eumyd', 1),\n",
       " ('113', 7),\n",
       " ('4k120', 2),\n",
       " ('︠', 8),\n",
       " ('2682', 4),\n",
       " ('20130829', 1),\n",
       " ('no2', 6),\n",
       " ('166222', 2),\n",
       " ('20150826', 1),\n",
       " ('50w', 1),\n",
       " ('7786640', 1),\n",
       " ('6881', 1),\n",
       " ('086', 1),\n",
       " ('fw2', 1),\n",
       " ('20120512', 7),\n",
       " ('d959', 1),\n",
       " ('rqmegupqwp5h3ym', 1),\n",
       " ('2018', 28),\n",
       " ('20160507oh', 1),\n",
       " ('6768473', 1),\n",
       " ('10xnormal', 1),\n",
       " ('3c', 3),\n",
       " ('1114', 1),\n",
       " ('o2nd', 1),\n",
       " ('84253415', 1),\n",
       " ('2016036', 1),\n",
       " ('92', 47),\n",
       " ('tt999177', 2),\n",
       " ('49am', 1),\n",
       " ('😗', 15),\n",
       " ('🇰', 27),\n",
       " ('68', 56),\n",
       " ('20120224', 1),\n",
       " ('japhson4', 2),\n",
       " ('835', 1),\n",
       " ('🍅', 1),\n",
       " ('xnjmxnjkxotc2', 1),\n",
       " ('❁', 67),\n",
       " ('10gb', 1),\n",
       " ('🌕', 6),\n",
       " ('1008', 2),\n",
       " ('1311', 3),\n",
       " ('20150528', 2),\n",
       " ('20090226', 1),\n",
       " ('av1693255', 1),\n",
       " ('5w6', 1),\n",
       " ('80s90s', 1),\n",
       " ('bbx2839398862', 1),\n",
       " ('20cm', 1),\n",
       " ('512', 13),\n",
       " ('606668916', 1),\n",
       " ('h2016', 1),\n",
       " ('3467', 1),\n",
       " ('︺', 1),\n",
       " ('320201', 1),\n",
       " ('1927', 3),\n",
       " ('20130224', 5),\n",
       " ('7494126', 2),\n",
       " ('45min', 3),\n",
       " ('120804', 1),\n",
       " ('p1', 2),\n",
       " ('3e', 1),\n",
       " ('16wizs', 1),\n",
       " ('a59afaf19b42dcaafb79263aae6fec32', 1),\n",
       " ('※', 14),\n",
       " ('20120725', 1),\n",
       " ('100rmb', 2),\n",
       " ('1464x1080', 2),\n",
       " ('e100076', 3),\n",
       " ('94032346', 1),\n",
       " ('㡳', 1),\n",
       " ('c18d', 2),\n",
       " ('bd056', 1),\n",
       " ('2d3d', 16),\n",
       " ('071020', 1),\n",
       " ('20130119', 3),\n",
       " ('rjzv3msojrw', 1),\n",
       " ('2ffacecool', 1),\n",
       " ('2012oct23', 2),\n",
       " ('20160220with', 1),\n",
       " ('k9c8cbdhnuue', 1),\n",
       " ('🌏', 1),\n",
       " ('2308ec70', 2),\n",
       " ('170206', 1),\n",
       " ('2017020', 1),\n",
       " ('153', 9),\n",
       " ('f9f93311c4', 1),\n",
       " ('12h', 1),\n",
       " ('4913', 1),\n",
       " ('280kg', 3),\n",
       " ('4f', 2),\n",
       " ('1366891', 1),\n",
       " ('1983', 19),\n",
       " ('m9', 2),\n",
       " ('❄', 2),\n",
       " ('1998', 51),\n",
       " ('25cents', 1),\n",
       " ('20121028', 1),\n",
       " ('418', 1),\n",
       " ('072614', 1),\n",
       " ('20160618', 2),\n",
       " ('tony10', 1),\n",
       " ('room1', 2),\n",
       " ('4778', 1),\n",
       " ('⁼', 40),\n",
       " ('1234567', 13),\n",
       " ('20150618', 1),\n",
       " ('5200', 2),\n",
       " ('flowers4', 1),\n",
       " ('90575406', 1),\n",
       " ('067', 1),\n",
       " ('20170203', 5),\n",
       " ('8910', 2),\n",
       " ('1018', 2),\n",
       " ('hard3', 2),\n",
       " ('670408364', 2),\n",
       " ('741', 1),\n",
       " ('ꄬ', 2),\n",
       " ('782', 2),\n",
       " ('20160828', 1),\n",
       " ('3058327', 1),\n",
       " ('20170105', 3),\n",
       " ('1430193', 1),\n",
       " ('plus4', 1),\n",
       " ('4887859465', 1),\n",
       " ('4a', 1),\n",
       " ('0427bgm8fowuak80jbudny3n', 1),\n",
       " ('1220', 4),\n",
       " ('376m', 1),\n",
       " ('target10', 1),\n",
       " ('0314', 2),\n",
       " ('2baby', 2),\n",
       " ('081003', 1),\n",
       " ('0000000000000', 1),\n",
       " ('151011', 3),\n",
       " ('201512', 3),\n",
       " ('mx4d', 15),\n",
       " ('eiff2014', 1),\n",
       " ('3959', 1),\n",
       " ('02042230', 1),\n",
       " ('57879606', 1),\n",
       " ('120cm', 1),\n",
       " ('151720', 1),\n",
       " ('2304189ed65add0102vixp', 1),\n",
       " ('20130101', 6),\n",
       " ('t0t', 16),\n",
       " ('2017', 1632),\n",
       " ('oad2', 1),\n",
       " ('183cm', 2),\n",
       " ('180cm', 1),\n",
       " ('201645', 1),\n",
       " ('nm3176450', 1),\n",
       " ('33gb', 1),\n",
       " ('1h55m25s', 1),\n",
       " ('ac3', 31),\n",
       " ('535', 2),\n",
       " ('211985', 1),\n",
       " ('140', 185),\n",
       " ('280min', 1),\n",
       " ('a8l', 3),\n",
       " ('13366053217', 1),\n",
       " ('4s', 10),\n",
       " ('7610', 3),\n",
       " ('10km', 1),\n",
       " ('ଓ', 6),\n",
       " ('2a', 1),\n",
       " ('polo1', 3),\n",
       " ('3er5rk', 1),\n",
       " ('↓', 19),\n",
       " ('e1v', 1),\n",
       " ('4536', 1),\n",
       " ('\\ue418', 7),\n",
       " ('e127', 1),\n",
       " ('27253', 3),\n",
       " ('702', 1),\n",
       " ('141129', 1),\n",
       " ('͇', 1),\n",
       " ('around30', 1),\n",
       " ('zzjjrr0', 1),\n",
       " ('🎭', 2),\n",
       " ('54895', 1),\n",
       " ('pm51', 1),\n",
       " ('8105163', 1),\n",
       " ('0521', 2),\n",
       " ('363487820', 1),\n",
       " ('↔', 1),\n",
       " ('🎵', 129),\n",
       " ('1322', 1),\n",
       " ('ಥ', 624),\n",
       " ('2012july7', 2),\n",
       " ('128', 29),\n",
       " ('1945', 8),\n",
       " ('kumamon20160512', 1),\n",
       " ('m48', 1),\n",
       " ('84th', 1),\n",
       " ('₊', 3),\n",
       " ('3hrs', 1),\n",
       " ('7785377', 1),\n",
       " ('༿', 1),\n",
       " ('160304', 3),\n",
       " ('bili88', 1),\n",
       " ('pn1997', 1),\n",
       " ('̯', 30),\n",
       " ('😌', 529),\n",
       " ('1302644', 2),\n",
       " ('comic1', 1),\n",
       " ('354', 1),\n",
       " ('r2', 14),\n",
       " ('qq1260630802', 1),\n",
       " ('tt0114126', 2),\n",
       " ('10with', 3),\n",
       " ('🐧', 1),\n",
       " ('20090823', 2),\n",
       " ('1923', 3),\n",
       " ('die233333', 1),\n",
       " ('141min', 2),\n",
       " ('823', 1),\n",
       " ('x1997', 1),\n",
       " ('z108', 1),\n",
       " ('28596', 2),\n",
       " ('20151007with', 1),\n",
       " ('20120502', 1),\n",
       " ('🏆', 4),\n",
       " ('༄', 2),\n",
       " ('༵', 7),\n",
       " ('0503', 2),\n",
       " ('1champ', 1),\n",
       " ('siff07', 1),\n",
       " ('6at', 1),\n",
       " ('140802', 4),\n",
       " ('m7', 2),\n",
       " ('2824196627', 1),\n",
       " ('5gn', 2),\n",
       " ('0106', 1),\n",
       " ('20160518', 3),\n",
       " ('66666666666666666666', 1),\n",
       " ('403271052', 1),\n",
       " ('9e44c0750102uy0e', 1),\n",
       " ('𢤦', 1),\n",
       " ('d300s', 1),\n",
       " ('20160211', 6),\n",
       " ('m1', 4),\n",
       " ('773', 5),\n",
       " ('1293603383', 1),\n",
       " ('20120527', 2),\n",
       " ('567', 4),\n",
       " ('6136203', 1),\n",
       " ('0516', 3),\n",
       " ('20151016', 5),\n",
       " ('678910', 2),\n",
       " ('╱', 4),\n",
       " ('﹑', 1),\n",
       " ('20160120', 1),\n",
       " ('xmjq0mdgyndk2', 2),\n",
       " ('93481744863', 2),\n",
       " ('twilight3', 1),\n",
       " ('﹠', 1),\n",
       " ('151003', 1),\n",
       " ('xntgymtc5otyw', 1),\n",
       " ('189093', 1),\n",
       " ('10086fen', 1),\n",
       " ('ᵌ', 12),\n",
       " ('20130106', 3),\n",
       " ('201702141a', 1),\n",
       " ('🐆', 6),\n",
       " ('ꈊ', 9),\n",
       " ('3cm', 1),\n",
       " ('27cgv', 1),\n",
       " ('561', 1),\n",
       " ('2h28min', 1),\n",
       " ('home2016312', 1),\n",
       " ('g119', 1),\n",
       " ('mk1', 3),\n",
       " ('24fps', 1),\n",
       " ('϶', 2),\n",
       " ('6v1', 2),\n",
       " ('✄', 3),\n",
       " ('434', 1),\n",
       " ('3871516', 1),\n",
       " ('26xy', 1),\n",
       " ('100014', 2),\n",
       " ('20160103', 2),\n",
       " ('☔', 12),\n",
       " ('k3k3', 1),\n",
       " ('q29409', 1),\n",
       " ('20160216', 4),\n",
       " ('201301', 2),\n",
       " ('1418019', 1),\n",
       " ('25h', 1),\n",
       " ('2233514', 1),\n",
       " ('160213withbing', 1),\n",
       " ('ꌂ', 4),\n",
       " ('61724', 1),\n",
       " ('20160116', 2),\n",
       " ('400w', 1),\n",
       " ('8z', 1),\n",
       " ('48558', 1),\n",
       " ('﹁', 146),\n",
       " ('201604', 4),\n",
       " ('161107', 2),\n",
       " ('day8', 1),\n",
       " ('h2', 15),\n",
       " ('9322', 1),\n",
       " ('20161230', 2),\n",
       " ('071', 2),\n",
       " ('›', 13),\n",
       " ('028', 1),\n",
       " ('akb48', 17),\n",
       " ('8000', 26),\n",
       " ('ॄ', 1),\n",
       " ('﹂', 12),\n",
       " ('7969003', 1),\n",
       " ('23min', 1),\n",
       " ('１', 84),\n",
       " ('5714', 3),\n",
       " ('x2', 31),\n",
       " ('290204', 2),\n",
       " ('121223', 2),\n",
       " ('170212vlive', 1),\n",
       " ('8r8iovyj0f7x', 1),\n",
       " ('moto1200', 1),\n",
       " ('04192014', 1),\n",
       " ('2333333333333333333333', 1),\n",
       " ('283013556', 1),\n",
       " ('2017219', 1),\n",
       " ('0108', 1),\n",
       " ('zc1000', 1),\n",
       " ('150202', 1),\n",
       " ('435', 1),\n",
       " ('13000000', 1),\n",
       " ('⭕', 6),\n",
       " ('201408142343', 1),\n",
       " ('7981391', 1),\n",
       " ('m68', 1),\n",
       " ('👽', 46),\n",
       " ('fz11', 2),\n",
       " ('bgm120', 1),\n",
       " ('20140908', 1),\n",
       " ('wow5', 1),\n",
       " ('ova2', 1),\n",
       " ('7496761', 1),\n",
       " ('mjm5mja0mtk4ma', 1),\n",
       " ('20161112', 2),\n",
       " ('⊰', 1),\n",
       " ('1892118722', 1),\n",
       " ('116114', 1),\n",
       " ('15ipad', 1),\n",
       " ('49pm', 1),\n",
       " ('520110696283', 1),\n",
       " ('\\ue442', 2),\n",
       " ('20150924', 3),\n",
       " ('wl27', 1),\n",
       " ('3ftopnav', 1),\n",
       " ('4w', 1),\n",
       " ('2016042', 1),\n",
       " ('➕', 580),\n",
       " ('best3', 1),\n",
       " ('g101', 1),\n",
       " ('20160210', 13),\n",
       " ('a4', 15),\n",
       " ('p2p', 3),\n",
       " ('0713', 1),\n",
       " ('⃔', 2),\n",
       " ('z1cosr5vbwe', 1),\n",
       " ('20120509', 2),\n",
       " ('091', 2),\n",
       " ('20160822', 1),\n",
       " ('246', 7),\n",
       " ('11yrer', 1),\n",
       " ('20120626', 1),\n",
       " ('06june', 1),\n",
       " ('29th', 3),\n",
       " ('20140731', 5),\n",
       " ('3140', 1),\n",
       " ('7180016', 2),\n",
       " ('🎷', 7),\n",
       " ('90m', 1),\n",
       " ('ac130b', 1),\n",
       " ('14353810857', 1),\n",
       " ('604', 3),\n",
       " ('20161129', 1),\n",
       " ('160515', 4),\n",
       " ('11wc', 1),\n",
       " ('20150107', 1),\n",
       " ('๏', 2),\n",
       " ('1294183', 1),\n",
       " ('10captain', 1),\n",
       " ('151219wjaqh', 1),\n",
       " ('150305', 1),\n",
       " ('160311', 3),\n",
       " ('01aug', 1),\n",
       " ('275', 9),\n",
       " ('10000000000', 3),\n",
       " ('800m', 1),\n",
       " ('buff233', 1),\n",
       " ('2ftrack', 1),\n",
       " ('west2013', 1),\n",
       " ('529', 1),\n",
       " ('66666666', 6),\n",
       " ('1h29min', 1),\n",
       " ('131103', 1),\n",
       " ('ɷ', 2),\n",
       " ('∨', 2),\n",
       " ('20160526', 2),\n",
       " ('20110520', 1),\n",
       " ('2hrs', 2),\n",
       " ('ck2', 1),\n",
       " ('20130217', 1),\n",
       " ('﹖', 1),\n",
       " ('҈', 5),\n",
       " ('🌫', 2),\n",
       " ('6to23', 1),\n",
       " ('1530', 1),\n",
       " ('6', 9342),\n",
       " ('20150322', 1),\n",
       " ('547828710', 1),\n",
       " ('871', 1),\n",
       " ('20170128', 13),\n",
       " ('av2951267', 1),\n",
       " ('ູ', 7),\n",
       " ('30mv', 1),\n",
       " ('1733', 1),\n",
       " ('ch6rmnlxhtp76', 1),\n",
       " ('26april', 1),\n",
       " ('📽', 5),\n",
       " ('20141220yl', 1),\n",
       " ('🍚', 1),\n",
       " ('═', 9),\n",
       " ('23685', 1),\n",
       " ('ᴥ', 19),\n",
       " ('1830', 1),\n",
       " ('ﻭ', 1),\n",
       " ('20151012', 4),\n",
       " ('66666666666', 3),\n",
       " ('071218', 1),\n",
       " ('av2', 1),\n",
       " ('1999', 62),\n",
       " ('152', 5),\n",
       " ('b7', 4),\n",
       " ('\\ue52c', 1),\n",
       " ('wl24', 1),\n",
       " ('20170224', 4),\n",
       " ('aqz1u4fj', 1),\n",
       " ('05hbv', 1),\n",
       " ('cxj24394', 2),\n",
       " ('7888762', 1),\n",
       " ('o3o', 2),\n",
       " ('20170219', 11),\n",
       " ('‐', 5),\n",
       " ('080121', 3),\n",
       " ('6666666666666666666666666', 1),\n",
       " ('2014top10', 1),\n",
       " ('ؒ', 12),\n",
       " ('−', 25),\n",
       " ('🎏', 7),\n",
       " ('2012oct12', 1),\n",
       " ('729ifree3d', 1),\n",
       " ('1v2', 15),\n",
       " ('126min', 4),\n",
       " ('🔻', 2),\n",
       " ('2012sep18', 1),\n",
       " ('20130118', 3),\n",
       " ('3idiots', 1),\n",
       " ('䆳', 1),\n",
       " ('2e', 2),\n",
       " ('1885', 1),\n",
       " ('20150426', 3),\n",
       " ('🍜', 5),\n",
       " ('⚾', 2),\n",
       " ('gay20140720', 1),\n",
       " ('█', 19),\n",
       " ('🔐', 1),\n",
       " ('20120610', 2),\n",
       " ('20090926', 1),\n",
       " ('🌌', 9),\n",
       " ('₌', 2),\n",
       " ('adverformers4', 1),\n",
       " ('4180445', 1),\n",
       " ('141001', 1),\n",
       " ('140728', 4),\n",
       " ('gm339021671', 1),\n",
       " ('🐢', 3),\n",
       " ('\\uf614', 1),\n",
       " ('x6', 3),\n",
       " ('40k', 1),\n",
       " ('cc00xndq1nzuxng', 1),\n",
       " ('10000011', 1),\n",
       " ('💖', 80),\n",
       " ('0220', 1),\n",
       " ('jb5', 1),\n",
       " ('👣', 4),\n",
       " ('20120506', 5),\n",
       " ('⁍', 115),\n",
       " ('1000w', 3),\n",
       " ('6545434323217', 1),\n",
       " ('ver3', 1),\n",
       " ('2014124', 1),\n",
       " ('🔙', 1),\n",
       " ('㴾', 3),\n",
       " ('p2363241517', 1),\n",
       " ('20130510', 1),\n",
       " ('160123', 1),\n",
       " ('low13', 1),\n",
       " ('b9', 3),\n",
       " ('100', 3125),\n",
       " ('mv10004', 1),\n",
       " ('20140901', 1),\n",
       " ('20151101', 1),\n",
       " ('↖', 83),\n",
       " ('6080', 1),\n",
       " ('70s80s', 1),\n",
       " ('ac1986709', 1),\n",
       " ('161028', 1),\n",
       " ('🕶', 4),\n",
       " ('4346272', 1),\n",
       " ('crossover2', 1),\n",
       " ('㕛', 1),\n",
       " ('1941', 8),\n",
       " ('828', 1),\n",
       " ('161016', 2),\n",
       " ('20151006withlizzy', 1),\n",
       " ('4349', 1),\n",
       " ('2333dean', 1),\n",
       " ('20160615', 1),\n",
       " ('6pm', 1),\n",
       " ('6379639', 1),\n",
       " ('20100725', 2),\n",
       " ('‿', 82),\n",
       " ('at2013', 1),\n",
       " ('5841', 1),\n",
       " ('img3', 4),\n",
       " ('160913', 1),\n",
       " ('faith710', 1),\n",
       " ('002', 2),\n",
       " ('jc58', 1),\n",
       " ('xbb1897', 1),\n",
       " ('5404838', 8),\n",
       " ('20141212', 1),\n",
       " ('qio284', 1),\n",
       " ('tm5', 2),\n",
       " ('orz8', 1),\n",
       " ('25934014', 1),\n",
       " ('b3', 9),\n",
       " ('5888080', 1),\n",
       " ('😠', 78),\n",
       " ('2800', 3),\n",
       " ('2h', 21),\n",
       " ('160', 168),\n",
       " ('201711', 1),\n",
       " ('11048', 2),\n",
       " ('zl11166618', 1),\n",
       " ('🇷', 24),\n",
       " ('080618', 1),\n",
       " ('607835687', 1),\n",
       " ('20130222ty', 1),\n",
       " ('0714', 1),\n",
       " ('298', 4),\n",
       " ('0000000000001', 1),\n",
       " ('1491k', 1),\n",
       " ('by2016', 1),\n",
       " ('348', 1),\n",
       " ('mi6', 2),\n",
       " ('20170214', 33),\n",
       " ('t5', 2),\n",
       " ('2015best', 1),\n",
       " ('1020', 1),\n",
       " ('qq2942006021', 2),\n",
       " ('0310', 1),\n",
       " ('71', 36),\n",
       " ('180504', 2),\n",
       " ('3839', 1),\n",
       " ('450km', 1),\n",
       " ('161', 3),\n",
       " ('20150601', 1),\n",
       " ('4th', 8),\n",
       " ('20120611', 1),\n",
       " ('23333', 635),\n",
       " ('150715', 2),\n",
       " ('2w6', 1),\n",
       " ('3221', 1),\n",
       " ('101', 312),\n",
       " ('2016116', 1),\n",
       " ('8340044', 1),\n",
       " ('av1707160', 1),\n",
       " ('48fps', 2),\n",
       " ('129', 13),\n",
       " ('꒰', 20),\n",
       " ('84698176', 1),\n",
       " ('20120820', 1),\n",
       " ('8083627', 1),\n",
       " ('12139473333', 1),\n",
       " ('tat3', 1),\n",
       " ('bffov35cn', 1),\n",
       " ('xnzewntm4odq', 2),\n",
       " ('angela860429', 1),\n",
       " ('5880', 1),\n",
       " ('36727', 1),\n",
       " ('315', 5),\n",
       " ('510', 2),\n",
       " ('ᶘ', 4),\n",
       " ('27003912', 1),\n",
       " ('u571', 1),\n",
       " ('20130311', 1),\n",
       " ('20150604', 1),\n",
       " ('8823', 1),\n",
       " ('0303', 2),\n",
       " ('20170617', 1),\n",
       " ('40th', 1),\n",
       " ('쮼', 1),\n",
       " ('🍦', 5),\n",
       " ('🐠', 48),\n",
       " ('p1000', 2),\n",
       " ('20130302', 2),\n",
       " ('21', 827),\n",
       " ('3d22', 1),\n",
       " ('25mins', 1),\n",
       " ('10086', 85),\n",
       " ('0924', 2),\n",
       " ('7529662', 2),\n",
       " ('20150707', 1),\n",
       " ('5705551', 1),\n",
       " ('22222222', 1),\n",
       " ('20160328', 2),\n",
       " ('onizuka47', 1),\n",
       " ('⊥', 2),\n",
       " ('ac1994371', 1),\n",
       " ('2309403972475904988253', 1),\n",
       " ('160713', 2),\n",
       " ('ds23', 1),\n",
       " ('👗', 19),\n",
       " ('༣', 3),\n",
       " ('2t', 3),\n",
       " ('20140811', 1),\n",
       " ('499', 3),\n",
       " ('2911', 1),\n",
       " ('1609', 1),\n",
       " ('1995', 50),\n",
       " ('177cm', 2),\n",
       " ('160209', 3),\n",
       " ('avip226', 1),\n",
       " ('107', 55),\n",
       " ('1959', 4),\n",
       " ('ੇ', 1),\n",
       " ('e4', 15),\n",
       " ('hiber58eri1m2098', 1),\n",
       " ('moma2015', 1),\n",
       " ('20130505', 1),\n",
       " ('ཫ', 2),\n",
       " ('f11', 1),\n",
       " ('20150606', 1),\n",
       " ('2016022701', 1),\n",
       " ('1209600', 2),\n",
       " ('h1b', 1),\n",
       " ('70739', 2),\n",
       " ('5650990758', 1),\n",
       " ('͎', 1),\n",
       " ('10261119', 1),\n",
       " ('160918', 3),\n",
       " ('708090', 1),\n",
       " ('55945', 1),\n",
       " ('hdy360', 1),\n",
       " ('7mia', 1),\n",
       " ('20151211', 1),\n",
       " ('126242243', 1),\n",
       " ('20140203', 1),\n",
       " ('h1083059083', 1),\n",
       " ('84', 62),\n",
       " ('170220', 1),\n",
       " ('xa5j', 1),\n",
       " ('6with', 3),\n",
       " ('2000qb', 1),\n",
       " ('oad1', 1),\n",
       " ('140629', 1),\n",
       " ('🃏', 2),\n",
       " ('37w', 1),\n",
       " ('1108', 3),\n",
       " ('250', 125),\n",
       " ('22543', 1),\n",
       " ('1933', 82),\n",
       " ('201703', 2),\n",
       " ('210357758', 2),\n",
       " ('s1', 7),\n",
       " ('2015071701', 1),\n",
       " ('20140907', 1),\n",
       " ('7533321', 1),\n",
       " ('9798', 1),\n",
       " ('301341', 1),\n",
       " ('1808', 1),\n",
       " ('66666', 45),\n",
       " ('river23', 1),\n",
       " ('american3', 1),\n",
       " ('tony3', 1),\n",
       " ('🚤', 1),\n",
       " ('1004', 1),\n",
       " ('🗼', 3),\n",
       " ('20171218', 1),\n",
       " ('0129', 2),\n",
       " ('120528', 1),\n",
       " ('📣', 4),\n",
       " ('0729', 1),\n",
       " ('⁽', 90),\n",
       " ('157', 7),\n",
       " ('35am', 1),\n",
       " ('20130421', 1),\n",
       " ('5ccd0d0c70960321d3', 1),\n",
       " ('✘', 22),\n",
       " ('6764158', 2),\n",
       " ('0817', 1),\n",
       " ('368', 2),\n",
       " ('॓', 1),\n",
       " ('20170608', 1),\n",
       " ('0214', 7),\n",
       " ('🍪', 2),\n",
       " ('🔫', 22),\n",
       " ('♪', 126),\n",
       " ('607125430', 1),\n",
       " ('20121125', 1),\n",
       " ('24a4', 1),\n",
       " ('20150518', 8),\n",
       " ('1920', 1),\n",
       " ('dota2', 13),\n",
       " ('27cinema14', 1),\n",
       " ('1280124963', 1),\n",
       " ('soi2', 1),\n",
       " ('e1', 2),\n",
       " ('124', 22),\n",
       " ('cctv10', 4),\n",
       " ('20160228', 4),\n",
       " ('111111111', 1),\n",
       " ('7553680', 1),\n",
       " ('49th', 1),\n",
       " ('3001', 1),\n",
       " ('1987', 22),\n",
       " ('48747088', 2),\n",
       " ('1936', 3),\n",
       " ('2015siff', 9),\n",
       " ('tony100', 1),\n",
       " ('20160504', 2),\n",
       " ('10am', 2),\n",
       " ('081123', 1),\n",
       " ('㊙', 1),\n",
       " ('x100', 3),\n",
       " ('981', 1),\n",
       " ('👙', 4),\n",
       " ('▀', 16),\n",
       " ('23333333333333', 6),\n",
       " ('⍛', 4),\n",
       " ('c0c0', 1),\n",
       " ('30am', 3),\n",
       " ('20130305', 1),\n",
       " ('⊱', 2),\n",
       " ('75most', 1),\n",
       " ('f1682340o1p26', 1),\n",
       " ('nzgwzpenjjggf96c5ftqmq', 1),\n",
       " ('08marvel', 1),\n",
       " ('5sing', 2),\n",
       " ('😝', 258),\n",
       " ('1859', 1),\n",
       " ('e0befba921ff7cc558e33ce2b37dece4501b1777', 1),\n",
       " ('20081216', 1),\n",
       " ('20120510', 3),\n",
       " ('˔', 2),\n",
       " ('2w', 2),\n",
       " ('11th', 3),\n",
       " ('mk43', 1),\n",
       " ('baby5', 3),\n",
       " ('03', 560),\n",
       " ('3dcg', 7),\n",
       " ('g5', 1),\n",
       " ('grmad2vqr98', 2),\n",
       " ('18cinema', 1),\n",
       " ('u2be', 1),\n",
       " ('201202', 1),\n",
       " ...]"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Entries of unknown_words that appear in neither unknown_chinese nor\n",
    "# unknown_string.\n",
    "# NOTE(review): list(set(...)) has no defined order, so the order shown\n",
    "# below may differ between kernel sessions.\n",
    "unknown_others = list(\n",
    "    set(unknown_words) - set(unknown_chinese) - set(unknown_string))\n",
    "unknown_others\n",
    "\n",
    "# What remains is mostly mixtures of digits, English letters and emoji."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T02:41:27.450404Z",
     "start_time": "2020-05-14T02:41:26.994285Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'hhhhh': 'h',\n",
       " 'hhhhhh': 'h',\n",
       " 'hhhhhhh': 'h',\n",
       " 'wwwww': 'w',\n",
       " 'xdddd': 'xd',\n",
       " 'sooooo': 'so',\n",
       " 'xddddd': 'xd',\n",
       " 'hhhhhhhhh': 'h',\n",
       " 'soooooo': 'so',\n",
       " 'wwwwww': 'w',\n",
       " 'sooooooo': 'so',\n",
       " 'awwww': 'aw',\n",
       " 'hhhhhhhhhhh': 'h',\n",
       " 'tttt': 't',\n",
       " 'cooooool': 'col',\n",
       " 'wwwwwww': 'w',\n",
       " 'soooooooo': 'so',\n",
       " 'zzzzz': 'z',\n",
       " 'hhhhhhhhhh': 'h',\n",
       " 'yooooooo': 'yo',\n",
       " 'yoooooooo': 'yo',\n",
       " 'xdddddd': 'xd',\n",
       " 'xddddddd': 'xd',\n",
       " 'hmmmm': 'hm',\n",
       " 'wwwwwwwww': 'w',\n",
       " 'wwwwwwww': 'w',\n",
       " 'hhhhhhhhhhhhh': 'h',\n",
       " 'cooooooool': 'col',\n",
       " 'ahhhh': 'ah',\n",
       " 'ttttt': 't',\n",
       " 'qaqqqq': 'qaq',\n",
       " 'yooooo': 'yo',\n",
       " 'awwwww': 'aw',\n",
       " 'pppps': 'ps',\n",
       " 'tooooo': 'to',\n",
       " 'cooool': 'col',\n",
       " 'kkkkk': 'k',\n",
       " 'soooooooooo': 'so',\n",
       " 'hmmmmm': 'hm',\n",
       " 'ttttttt': 't',\n",
       " 'coooooool': 'col',\n",
       " 'sooooooooooo': 'so',\n",
       " 'toooooo': 'to',\n",
       " 'sooooooooo': 'so',\n",
       " 'coooool': 'col',\n",
       " 'awwwwww': 'aw',\n",
       " 'tttttttt': 't',\n",
       " 'sooooooooooooo': 'so',\n",
       " 'xxxxxxxxxx': 'x',\n",
       " 'tttttttttttttt': 't',\n",
       " 'qwqqqq': 'qwq',\n",
       " 'aaaaaaa': 'a',\n",
       " 'yoooooo': 'yo',\n",
       " 'bbbbb': 'b',\n",
       " 'tttttt': 't',\n",
       " 'qwqqqqq': 'qwq',\n",
       " 'boooooooom': 'bom',\n",
       " 'yoooooooooo': 'yo',\n",
       " 'bbbbbb': 'b',\n",
       " 'zzzzzzzzz': 'z',\n",
       " 'hahahhhh': 'hahah',\n",
       " 'hhhhhhhhhhhhhhh': 'h',\n",
       " 'awwwwwwww': 'aw',\n",
       " 'qaqqqqq': 'qaq',\n",
       " 'eeeee': 'e',\n",
       " 'lokiiiiiiiiiiii': 'loki',\n",
       " 'sooooooooooooooooo': 'so',\n",
       " 'cooooooooooooooool': 'col',\n",
       " 'xddddddddd': 'xd',\n",
       " 'soooooooooooo': 'so',\n",
       " 'coooooooooooooooooooooooooool': 'col',\n",
       " 'xxxxxxxxxxx': 'x',\n",
       " 'ohhhh': 'oh',\n",
       " 'hhhhhhhhhhhhhhhhh': 'h',\n",
       " 'hhhhhhhhhhhhhhhhhhhhhhh': 'h',\n",
       " 'orzzzzzzz': 'orz',\n",
       " 'qaaaaq': 'qaq',\n",
       " 'hmmmmmm': 'hm',\n",
       " 'qqqqqqq': 'q',\n",
       " 'wwwwwwwwww': 'w',\n",
       " 'coooooooooooool': 'col',\n",
       " 'wwwwwwwwwwww': 'w',\n",
       " 'haaaa': 'ha',\n",
       " 'ewwwwwwww': 'ew',\n",
       " 'yoooooooooooooo': 'yo',\n",
       " 'wooooooow': 'wow',\n",
       " 'boooom': 'bom',\n",
       " 'orzzzz': 'orz',\n",
       " 'aaaaaaaaa': 'a',\n",
       " 'looooooove': 'love',\n",
       " 'looooooooooooooove': 'love',\n",
       " 'uxxxx': 'ux',\n",
       " 'bbbbbbbbbbbbbbbb': 'b',\n",
       " 'hhhhhhhhhhhh': 'h',\n",
       " 'loooooove': 'love',\n",
       " 'nooooooo': 'no',\n",
       " 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbest': 'best',\n",
       " 'schultzzzzzzzzz': 'schultz',\n",
       " 'noooooo': 'no',\n",
       " 'emmmmmmmmmmm': 'em',\n",
       " 'haahaaaaa': 'haaha',\n",
       " 'freeeeeeeeeeeeeeedom': 'fredom',\n",
       " 'boooooooring': 'boring',\n",
       " 'bbbbbbbbbest': 'best',\n",
       " 'woooooooooooooooooooooooooow': 'wow',\n",
       " 'nbbbbbbbbbb': 'nb',\n",
       " 'nickkkkk': 'nick',\n",
       " 'everrrrrrr': 'ever',\n",
       " 'hhhhhhhhhaaaaaaaaaa': 'ha',\n",
       " 'ufghfjjjjjbk': 'ufghfjbk',\n",
       " 'woooow': 'wow',\n",
       " 'hahahhhhh': 'hahah',\n",
       " 'soooocute': 'socute',\n",
       " 'loooose': 'lose',\n",
       " 'cuteeeeeeee': 'cute',\n",
       " 'hotttttt': 'hot',\n",
       " 'gooooooooooooooooood': 'god',\n",
       " 'highhhh': 'high',\n",
       " 'flashhhhhhhh': 'flash',\n",
       " 'flashhhhhh': 'flash',\n",
       " 'hahahahahhhh': 'hahahahah',\n",
       " 'hahahahhhh': 'hahahah',\n",
       " 'fffffffflllllllllaaaaaaassssshhhhhhhh': 'flash',\n",
       " 'cuuuuuuuute': 'cute',\n",
       " 'aaaaaaaaaaaahhhhhhhhhh': 'ah',\n",
       " 'aaaaaaaaaaaaawsome': 'awsome',\n",
       " 'awwwwwwwww': 'aw',\n",
       " 'chieeeeef': 'chief',\n",
       " 'prankkkkkkkkkkkkkk': 'prank',\n",
       " 'fiiiiiiiiiiiiiiiiine': 'fine',\n",
       " 'cuuuuuute': 'cute',\n",
       " 'amazeballsssss': 'amazeballs',\n",
       " 'clyyyyyyy': 'cly',\n",
       " 'rooooocks': 'rocks',\n",
       " 'ppppppppops': 'pops',\n",
       " 'sweeeeeet': 'swet',\n",
       " 'goooooood': 'god',\n",
       " 'dieeeeeeeeeeeeeeeeeeeeeeeeeeee': 'die',\n",
       " 'niiiiiiiiickkkkk': 'nick',\n",
       " 'flaaaaaaaaashhhhhh': 'flash',\n",
       " 'sweeeeeeeeeeeeeeeeet': 'swet',\n",
       " 'fluuuuuuuuuuuuffy': 'fluffy',\n",
       " 'xhhhhhhhhh': 'xh',\n",
       " 'ffffffffurry': 'furry',\n",
       " 'xdddddddddd': 'xd',\n",
       " 'fluffyyyyyyy': 'fluffy',\n",
       " 'loooove': 'love',\n",
       " 'awwwwwwwwwwwwwwwwwwwwwwwwwwwww': 'aw',\n",
       " 'cuuuuutttttteeeeee': 'cute',\n",
       " 'cuuuute': 'cute',\n",
       " 'cuuuuute': 'cute',\n",
       " 'gooooo': 'go',\n",
       " 'flyyyyyy': 'fly',\n",
       " 'greeeeeeeat': 'great',\n",
       " 'looooo': 'lo',\n",
       " 'gooooooooooood': 'god',\n",
       " 'aaaaaaaaaaa': 'a',\n",
       " 'niiiiiiiiick': 'nick',\n",
       " 'niiiiick': 'nick',\n",
       " 'whooooooo': 'who',\n",
       " 'looooool': 'lol',\n",
       " 'cooooooooool': 'col',\n",
       " 'qaaaaqjudy': 'qaqjudy',\n",
       " 'aaaaaahhhhhhhhhhhhhhhaaaaaaaaaaaahhhhhhhh': 'ahah',\n",
       " 'loooooooool': 'lol',\n",
       " 'slowwwwwwwwwwwly': 'slowly',\n",
       " 'qvqqqq': 'qvq',\n",
       " 'sssssso': 'so',\n",
       " 'sssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss': 's',\n",
       " 'qwqqqqqq': 'qwq',\n",
       " 'orzzzzzz': 'orz',\n",
       " 'whhhhhh': 'wh',\n",
       " 'qaqqqqqqqq': 'qaq',\n",
       " 'qaqqqqqq': 'qaq',\n",
       " 'ummmmmm': 'um',\n",
       " 'ttttatttt': 'tat',\n",
       " 'nnnnnnnn': 'n',\n",
       " 'ohhhhhh': 'oh',\n",
       " 'fffffffff': 'f',\n",
       " 'aaaaaaaaaa': 'a',\n",
       " 'gooooooodddddddddd': 'god',\n",
       " 'quqqqq': 'quq',\n",
       " 'nononononononooooooo': 'nonononononono',\n",
       " 'orzzzzzzzzzzz': 'orz',\n",
       " 'booooom': 'bom',\n",
       " 'tttttttttttttttttttttttt': 't',\n",
       " 'rainbowdaaaaaaaaaaash': 'rainbowdash',\n",
       " 'coollllllll': 'cool',\n",
       " 'boooooooooom': 'bom',\n",
       " 'boooooooooooooom': 'bom',\n",
       " 'xxxx108': 'x108',\n",
       " 'shitttttt': 'shit',\n",
       " 'loooong': 'long',\n",
       " 'orzzzzz': 'orz',\n",
       " 'yaaaaaaaaa': 'ya',\n",
       " 'woooo': 'wo',\n",
       " 'qqqaqqqqqqq': 'qqqaq',\n",
       " 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz': 'z',\n",
       " 'nnnnnn': 'n',\n",
       " '66666666666yyyyy': '66666666666y',\n",
       " 'uuuuuuu': 'u',\n",
       " 'nnnnnnnnnnn': 'n',\n",
       " '23333yooooooo': '23333yo',\n",
       " 'wooooooooooooooooooooooooooooooooo': 'wo',\n",
       " 'goooooooooooood': 'god',\n",
       " 'pooooooooooooooooooomv': 'pomv',\n",
       " 'ttttattttttt': 'tat',\n",
       " 'ughhhhh': 'ugh',\n",
       " 'hhhhhhhhhhah': 'hah',\n",
       " 'ssssssssssuck': 'suck',\n",
       " 'toooo': 'to',\n",
       " 'maxxxxxxxxxxxxxx': 'max',\n",
       " 'ewwwwww': 'ew',\n",
       " 'xdddddddd': 'xd',\n",
       " 'yoooooooooooooooooooooooo': 'yo',\n",
       " 'ccccccccnm': 'cnm',\n",
       " 'oooooooooooooo': 'o',\n",
       " 'fxxxxxxxx': 'fx',\n",
       " 'kkkkkkkkkk': 'k',\n",
       " 'oppppsss': 'opsss',\n",
       " 'wocccc': 'woc',\n",
       " 'ddddddddddddddddddddddd': 'd',\n",
       " 'zerohhhh': 'zeroh',\n",
       " 'hennnnnnnnnnnnnn': 'hen',\n",
       " 'zdddd3': 'zd3',\n",
       " 'tvvvvvvt': 'tvt',\n",
       " 'adaaaa': 'ada',\n",
       " 'hhhhhand': 'hand',\n",
       " 'uuuuuee': 'uee',\n",
       " 'zzzzzzzzzzzz': 'z',\n",
       " 'toooooooooooooo': 'to',\n",
       " 'mmmmmmmvvvvvvvvvvv': 'mv',\n",
       " 'pshaishihaoxihuanaaaaaaaa': 'pshaishihaoxihuana',\n",
       " 'xzzzzzzz': 'xz',\n",
       " 'xxxxxxxd': 'xd',\n",
       " 'staaaaaaaaaaaaaaaaaaaaaaaaaaaaaark': 'stark',\n",
       " 'gooood': 'god',\n",
       " 'ewwww': 'ew',\n",
       " 'xdddddddddddddd': 'xd',\n",
       " 'llllllllllllllove': 'love',\n",
       " 'wwwwwwwwwwwwwwwwwwwwttttttttttttttttttttttttttttttffffffffffffffffffffffffffffffff': 'wtf',\n",
       " 'smashhhhhhhhhh': 'smash',\n",
       " 'yooooooooooooooooooooooooooooooooooooooo': 'yo',\n",
       " 'yoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo': 'yo',\n",
       " 'tooooooo': 'to',\n",
       " 'haaaaaaaah': 'hah',\n",
       " 'yooooooooooooooooooooooooooo': 'yo',\n",
       " 'cooooooooooooooooooool': 'col',\n",
       " 'goodddddddddddddddddd': 'good',\n",
       " 'yooooooho': 'yoho',\n",
       " 'ohhhhhhh': 'oh',\n",
       " 'coooooooooooooooooool': 'col',\n",
       " 'kneeeeeeeel': 'knel',\n",
       " 'finalllllly': 'finaly',\n",
       " 'lawwwwlz': 'lawlz',\n",
       " 'lokiiiii': 'loki',\n",
       " 'looooooooove': 'love',\n",
       " 'wowowoooo': 'wowowo',\n",
       " 'yooooooooooooooooooo': 'yo',\n",
       " 'mmmmm': 'm',\n",
       " 'huuuuuuuuuuuuuuuulk': 'hulk',\n",
       " 'yooooooooooooooooooooooo': 'yo',\n",
       " 'rrrrrgggg': 'rg',\n",
       " 'leeee': 'le',\n",
       " 'veeeery': 'very',\n",
       " 'ddddddd': 'd',\n",
       " 'mmmmmmmmmmm': 'm',\n",
       " 'tmmmmmmmm': 'tm',\n",
       " 'yooooooooooooooo': 'yo',\n",
       " 'smaaaaaash': 'smash',\n",
       " 'vvvvv': 'v',\n",
       " 'hyiiiii': 'hyi',\n",
       " 'hotttttttt': 'hot',\n",
       " 'arrrrrr': 'ar',\n",
       " 'mannnnn': 'man',\n",
       " 'lunnnnn': 'lun',\n",
       " 'xxxxxxxxxxxxd': 'xd',\n",
       " 'zzzzzzzzzzzzzzzzz': 'z',\n",
       " 'coolllll': 'cool',\n",
       " 'bullsssss': 'bulls',\n",
       " 'hhhhhhhhhhhhhh': 'h',\n",
       " 'oooops': 'ops',\n",
       " 'ohhhhh': 'oh',\n",
       " 'hhhhhhhhhhhhhhhhhhhhhhhhhh': 'h',\n",
       " 'fucccccccck': 'fuck',\n",
       " 'ouffff': 'ouf',\n",
       " 'noooo': 'no',\n",
       " 'tttttttttttt': 't',\n",
       " 'cheeeeeese': 'chese',\n",
       " 'coooooooooooooooool': 'col',\n",
       " 'errrr': 'er',\n",
       " 'hhhhaha': 'haha',\n",
       " 'hahahhhhahah': 'hahahahah',\n",
       " 'hahahahahhhhhhhhhhhhhhhhhh': 'hahahahah',\n",
       " 'funnnny': 'funy',\n",
       " '2bbbbbbbbbbbbb': '2b',\n",
       " 'cheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeese': 'chese',\n",
       " 'saaaaaaaaaaaaaaaay': 'say',\n",
       " 'toooooooo': 'to',\n",
       " 'fffffffffffffff': 'f',\n",
       " 'qaaaaaaaaaaaaaaq': 'qaq',\n",
       " 'wwwwhat': 'what',\n",
       " 'ccccc': 'c',\n",
       " 'wahhhh': 'wah',\n",
       " 'waaaaaaaaay': 'way',\n",
       " 'prrrrrr': 'pr',\n",
       " 'pppppps': 'ps',\n",
       " 'loooooooove': 'love',\n",
       " 'yaaaassss': 'yas',\n",
       " 'bjjjjjjjjjj': 'bj',\n",
       " 'qqqqqqqqaqqqqqqqqq': 'qaq',\n",
       " 'uhmmmm': 'uhm',\n",
       " 'jezzzzzzz': 'jez',\n",
       " 'arghhhhhh': 'argh',\n",
       " 'pppp': 'p',\n",
       " 'cphhhhhhh': 'cph',\n",
       " 'dawadwdasssssssssssssssssssssssss': 'dawadwdas',\n",
       " 'sbbbbbbbbbbbbbbbbbbbbbbbbb': 'sb',\n",
       " 'bbbbbbbbbbbbbb': 'b',\n",
       " 'lolxxxxx': 'lolx',\n",
       " 'tooooooold': 'told',\n",
       " '3ddddddddddddddd': '3d',\n",
       " 'culttttttttttttttt': 'cult',\n",
       " 'ppppps': 'ps',\n",
       " 'manttttttttttttttttt': 'mant',\n",
       " 'aaaaaam': 'am',\n",
       " 'gooooooood': 'god',\n",
       " 'toooooooooooooooony': 'tony',\n",
       " 'xxxxxxxxxxxxxx': 'x',\n",
       " 'hoooooooh': 'hoh',\n",
       " 'odddddddddddddddddddddddd': 'od',\n",
       " 'gggggggggggggggggggggggggggggggggggggggg': 'g',\n",
       " 'boyhhhhhh': 'boyh',\n",
       " 'whaaaaaat': 'what',\n",
       " 'reallllly': 'realy',\n",
       " 'hhhhhha': 'ha',\n",
       " 'yooooh': 'yoh',\n",
       " 'yoooooooooooooooooooo': 'yo',\n",
       " 'yooooooooo': 'yo',\n",
       " 'qwwwwqqq': 'qwqqq',\n",
       " '3dhhhhh': '3dh',\n",
       " 'qwwwwwwq': 'qwq',\n",
       " 'whaaaaat': 'what',\n",
       " 'xxxxtop1': 'xtop1',\n",
       " 'cooooooooooool': 'col',\n",
       " 'maaaaxxxxxx': 'max',\n",
       " 'xddddddddddddd': 'xd',\n",
       " 'errrrrrrrrrrrrrrrrrrrrrrrrr': 'er',\n",
       " 'feeeeeeeel': 'fel',\n",
       " 'shiiiiiiiiiit': 'shit',\n",
       " 'omggggg': 'omg',\n",
       " 'whyyyyyyyyyyyyyy': 'why',\n",
       " 'shiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiit': 'shit',\n",
       " 'looooooooow': 'low',\n",
       " 'zzzzzzz': 'z',\n",
       " 'ygggffff': 'ygggf',\n",
       " 'alllllll': 'al',\n",
       " 'sooooocute': 'socute',\n",
       " '1vsnnnn': '1vsn',\n",
       " 'wwwttttfff': 'wwwtfff',\n",
       " 'yyyyyyyyyyyyyyyyyyyyyyyyy1': 'y1',\n",
       " 'haoooooo': 'hao',\n",
       " 'mannnnnn': 'man',\n",
       " 'noooooooooo': 'no',\n",
       " 'boyyyy': 'boy',\n",
       " 'sweettttttttttttttt': 'sweet',\n",
       " 'cuteeee': 'cute',\n",
       " 'coollllll': 'cool',\n",
       " 'fkkkkking': 'fking',\n",
       " 'kyaaaaa': 'kya',\n",
       " 'booooooooooooooooooooooooooooo': 'bo',\n",
       " 'tattttt': 'tat',\n",
       " 'allllllllll': 'al',\n",
       " 'bbbbbbbb': 'b',\n",
       " 'ummmm': 'um',\n",
       " 'zzzzzzzzzzzzzzz': 'z',\n",
       " 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz': 'z',\n",
       " 'wowowowwwwwwwww': 'wowowow',\n",
       " '3zzzzzzzzzz': '3z',\n",
       " 'zzzzzzzzzzzzzzzzzzzzzzz': 'z',\n",
       " 'cpwwww': 'cpw',\n",
       " 'xiaojjhhhhhh': 'xiaojjh',\n",
       " 'dddddd': 'd',\n",
       " 'cheeeeeeeeeeeeeeesy': 'chesy',\n",
       " 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee': 'e',\n",
       " 'yooooooooooo': 'yo',\n",
       " 'xxxxxxxxxxxxxxxxxxxxxx': 'x',\n",
       " 'lokiiiiiiiiii': 'loki',\n",
       " 'uhhhh': 'uh',\n",
       " 'taaaaaaaaat': 'tat',\n",
       " 'noooooooo': 'no',\n",
       " 'flaaaaaaat': 'flat',\n",
       " 'errrrrrrrr': 'er',\n",
       " 'bbbbbbbbbbbbbbbbbbbbbbb': 'b',\n",
       " 'zzzzzzzzzzzzzz': 'z',\n",
       " 'yayyyy': 'yay',\n",
       " 'booorriiiinnnnngggggg': 'booorring',\n",
       " 'shiiiiiiit': 'shit',\n",
       " 'gayyyyyyyyyyyyyyyyyyyyyyyyyyyy': 'gay',\n",
       " 'nooooooooo': 'no',\n",
       " 'boooooooooooring': 'boring',\n",
       " 'fuuuuuuuuuuck': 'fuck',\n",
       " 'toooooooooooooooo': 'to',\n",
       " 'zzzzzzzz': 'z',\n",
       " 'wowwwwwwwwwwwwwww': 'wow',\n",
       " 'whaaaaaaaaat': 'what',\n",
       " 'goooood': 'god',\n",
       " 'lannnnnnnnnnnnnnnnnnnnnn': 'lan',\n",
       " 'reaaaaaaaaaaaaaally': 'really',\n",
       " 'mdrrrrrrr': 'mdr',\n",
       " 'emmmmmm': 'em',\n",
       " 'looooser': 'loser',\n",
       " 'psssss': 'ps',\n",
       " 'ewwwwwwwwww': 'ew',\n",
       " 'pxxxx': 'px',\n",
       " 'ewwwww': 'ew',\n",
       " 'emmmmmmmmmm': 'em',\n",
       " 'ostostaaaa': 'ostosta',\n",
       " 'looooooooooooooong': 'long',\n",
       " 'emmmm': 'em',\n",
       " 'aaaaaaata': 'ata',\n",
       " 'wtffffffff': 'wtf',\n",
       " 'kooooooooooooooool': 'kol',\n",
       " 'nnnnnnnnnnnnnnnnnnn': 'n',\n",
       " 'aaaaaaaaaaaaaaaaah': 'ah',\n",
       " 'ahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhotdwarvesatyourservice': 'ahotdwarvesatyourservice',\n",
       " 'wwwwwwwwwwwwwwwww': 'w',\n",
       " 'callinggggg': 'calling',\n",
       " 'toooooooooooooooold': 'told',\n",
       " 'farrrrrr': 'far',\n",
       " 'haaaaaaaaaaa': 'ha',\n",
       " 'hoooooooooo': 'ho',\n",
       " 'woooooo': 'wo',\n",
       " 'booooooooooring': 'boring',\n",
       " 'tttttttvttttttt': 'tvt',\n",
       " 'mooooooooonlight': 'monlight',\n",
       " 'qvvvvq': 'qvq',\n",
       " 'wwwwwwwwwww': 'w',\n",
       " 'wahhhhhhhhhh': 'wah',\n",
       " 'bbbbbbbbbbe': 'be',\n",
       " 'bbbbbbb': 'b',\n",
       " 'ttttvtttt': 'tvt',\n",
       " 'woooooooooooooooow': 'wow',\n",
       " 'mirrennnnnnnnnnnnnnnnnn': 'mirren',\n",
       " 'yeaaaap': 'yeap',\n",
       " 'grooooooooot': 'grot',\n",
       " 'shockkkkkk': 'shock',\n",
       " 'ttttttttttttttttttttttt': 't',\n",
       " 'daaaaaaaaa': 'da',\n",
       " 'ggggood': 'good',\n",
       " 'tttttttttt': 't',\n",
       " 'groooooooooooot': 'grot',\n",
       " 'toooooout': 'tout',\n",
       " 'woooooooooooooo': 'wo',\n",
       " 'sweeeeeeeeeeeeeet': 'swet',\n",
       " 'nooooo': 'no'}"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_unknown = {}\n",
    "i = 0\n",
    "for w, _ in unknown_words:\n",
    "    # 删除重复字母\n",
    "    pat1 = re.compile(r\"([a-z])(\\1{3,})\")\n",
    "    # 删除重复的字符串\n",
    "    pat2 = re.compile(r\"(\\w+)(\\1{2,}?)\")\n",
    "    t1 = pat1.sub(r\"\\1\", w)\n",
    "    t2 = pat1.sub(r\"\\1\", w)\n",
    "    if t1 != w:\n",
    "        cleaned_unknown[w] = t1\n",
    "    else:\n",
    "        if t2 != w:\n",
    "            cleaned_unknown[w] = t2\n",
    "\n",
    "cleaned_unknown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 重新处理文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:44:49.240752Z",
     "start_time": "2020-05-14T06:44:49.231770Z"
    }
   },
   "outputs": [],
   "source": [
    "class Tokenizer:\n",
    "    def __init__(self, vocab):\n",
    "        self.vocab = vocab\n",
    "        self.inv_vocab = {v: k for k, v in vocab.items()}\n",
    "\n",
    "    def tokenize(self, text):\n",
    "        text = self.simplify(text)\n",
    "\n",
    "        split_tokens = []\n",
    "        for token in jieba.cut(text):\n",
    "            if token in self.vocab:\n",
    "                split_tokens.append(token)\n",
    "            else:\n",
    "                if self._is_chinese(token) or self._is_string(token):\n",
    "                    split_tokens.extend(self.max_forward_cut(token))\n",
    "        return split_tokens\n",
    "\n",
    "    def max_forward_cut(self, chars):\n",
    "        sub_tokens = []\n",
    "        start = 0\n",
    "        while start < len(chars):\n",
    "            end = len(chars)\n",
    "            cur_substr = None\n",
    "            while start < end:\n",
    "                substr = chars[start:end]\n",
    "                if substr in self.vocab:\n",
    "                    cur_substr = substr\n",
    "                    break\n",
    "                end -= 1\n",
    "            if cur_substr is None:\n",
    "                start += 1\n",
    "                sub_tokens.append('unknown')\n",
    "            else:\n",
    "                if not sub_tokens or cur_substr != sub_tokens[-1]:\n",
    "                    sub_tokens.append(cur_substr)\n",
    "                start = end\n",
    "        return sub_tokens\n",
    "\n",
    "    def simplify(self, text):\n",
    "        return zhconv.convert(text, 'zh-cn')\n",
    "\n",
    "    def _is_string(self, str):\n",
    "        for s in str:\n",
    "            if s not in string.ascii_lowercase:\n",
    "                return False\n",
    "        return True\n",
    "\n",
    "    def _is_chinese(self, text):\n",
    "        for char in text:\n",
    "            if u'\\u4e00' <= char <= u'\\u9fff':\n",
    "                return True\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:46:15.440185Z",
     "start_time": "2020-05-14T06:46:15.358978Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['12abc43', ',', '九百六十万平方公里', '。', 'pbocyq5ccfs', ' ', 'woow', '，', '哈哈哈', '哈哈哈', '喽']\n",
      "['九百', '六十万', '平方公里', 'woo', 'w', '哈哈哈', '哈哈哈', '喽']\n"
     ]
    }
   ],
   "source": [
    "tokenizer = Tokenizer(vocab=wv.vocab)\n",
    "\n",
    "text = '12abc43,九百六十万平方公里。pbocyq5ccfs woow，哈哈哈哈哈哈喽'\n",
    "print(list(jieba.cut(text)))\n",
    "print(tokenizer.tokenize(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:52:20.341864Z",
     "start_time": "2020-05-14T06:47:13.305836Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yangbin7/anaconda3/lib/python3.7/site-packages/tqdm/std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
      "  from pandas import Panel\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28b9b80eae084cae9d08c57bffb5e861",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='progress bar', max=2382890.0, style=ProgressStyle(descrip…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from tqdm.auto import tqdm\n",
    "tqdm.pandas(desc=\"progress bar\")\n",
    "\n",
    "dataset['cleaned_comment'] = dataset['comment'].progress_apply(tokenizer.tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 保存处理好的数据\n",
    "# with open('../datasets/douban_comments.txt', 'w') as f:\n",
    "#     for comment in dataset['cleaned_comment']:\n",
    "#         f.write(comment + '\\n')\n",
    "\n",
    "# with open('../datasets/douban_comment_stars.txt', 'w') as f:\n",
    "#     for star in dataset['star']:\n",
    "#         f.write(comment + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:58:40.338043Z",
     "start_time": "2020-05-14T06:58:34.276466Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词汇表中 100.00% 的单词有词向量\n",
      "评论的所有单词中 100.00% 的单词有词向量\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def build_vocab(texts):\n",
    "    vocab = {}\n",
    "    for tokens in texts:\n",
    "        for word in tokens:\n",
    "            if word != 'unknown':\n",
    "                vocab[word] = vocab.get(word, 0) + 1\n",
    "    return vocab\n",
    "\n",
    "vocab = build_vocab(dataset['cleaned_comment'])\n",
    "\n",
    "check_coverage(vocab, wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:59:16.615054Z",
     "start_time": "2020-05-14T06:59:16.559657Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'unknown': 0,\n",
       " '吴京': 1,\n",
       " '意淫': 2,\n",
       " '到': 3,\n",
       " '了': 4,\n",
       " '脑残': 5,\n",
       " '的': 6,\n",
       " '地步': 7,\n",
       " '看': 8,\n",
       " '恶心': 9,\n",
       " '想': 10,\n",
       " '吐': 11,\n",
       " '首映礼': 12,\n",
       " '太': 13,\n",
       " '恐怖': 14,\n",
       " '这个': 15,\n",
       " '电影': 16,\n",
       " '不讲道理': 17,\n",
       " '完全': 18,\n",
       " '就是': 19,\n",
       " '在': 20,\n",
       " '实现': 21,\n",
       " '他': 22,\n",
       " '小': 23,\n",
       " '粉红': 24,\n",
       " '英雄': 25,\n",
       " '梦': 26,\n",
       " '各种': 27,\n",
       " '装备': 28,\n",
       " '轮番': 29,\n",
       " '上场': 30,\n",
       " '视': 31,\n",
       " '物理': 32,\n",
       " '逻辑': 33,\n",
       " '于': 34,\n",
       " '不顾': 35,\n",
       " '不得不': 36,\n",
       " '说': 37,\n",
       " '有钱': 38,\n",
       " '真': 39,\n",
       " '好': 40,\n",
       " '随意': 41,\n",
       " '胡闹': 42,\n",
       " '炒作': 43,\n",
       " '水平': 44,\n",
       " '不输': 45,\n",
       " '冯小刚': 46,\n",
       " '但': 47,\n",
       " '小刚': 48,\n",
       " '至少': 49,\n",
       " '不会': 50,\n",
       " '用': 51,\n",
       " '主旋律': 52,\n",
       " '来': 53,\n",
       " '让': 54,\n",
       " '人': 55,\n",
       " '不': 56,\n",
       " '舒服': 57,\n",
       " '为了': 58,\n",
       " '而': 59,\n",
       " '煽情': 60,\n",
       " '觉得': 61,\n",
       " '是': 62,\n",
       " '个': 63,\n",
       " '大': 64,\n",
       " '做作': 65,\n",
       " '谎言': 66,\n",
       " '家': 67,\n",
       " '更新': 68,\n",
       " '片子': 69,\n",
       " '整体': 70,\n",
       " '不如': 71,\n",
       " '湄公河': 72,\n",
       " '行动': 73,\n",
       " '不够': 74,\n",
       " '流畅': 75,\n",
       " '编剧': 76,\n",
       " '有毒': 77,\n",
       " '台词': 78,\n",
       " '尴尬': 79,\n",
       " '刻意': 80,\n",
       " '显得': 81,\n",
       " '如此': 82,\n",
       " '不合时宜': 83,\n",
       " '又': 84,\n",
       " '多余': 85,\n",
       " '凭良心说': 86,\n",
       " '看到': 87,\n",
       " '不像': 88,\n",
       " '战狼': 89,\n",
       " '续集': 90,\n",
       " '完虐': 91,\n",
       " '中二': 92,\n",
       " '得': 93,\n",
       " '很': 94,\n",
       " '犯': 95,\n",
       " '我': 96,\n",
       " '中华': 97,\n",
       " '者': 98,\n",
       " '虽远': 99,\n",
       " '必': 100,\n",
       " '诛': 101,\n",
       " '比': 102,\n",
       " '这句': 103,\n",
       " '话': 104,\n",
       " '还要': 105,\n",
       " '一百倍': 106,\n",
       " '脑子': 107,\n",
       " '东西': 108,\n",
       " '希望': 109,\n",
       " '们': 110,\n",
       " '都': 111,\n",
       " '能': 112,\n",
       " '有': 113,\n",
       " '三星': 114,\n",
       " '半': 115,\n",
       " '实打实': 116,\n",
       " '分': 117,\n",
       " '第一集': 118,\n",
       " '爱国': 119,\n",
       " '内部': 120,\n",
       " '做': 121,\n",
       " '着': 122,\n",
       " '置换': 123,\n",
       " '与': 124,\n",
       " '较劲': 125,\n",
       " '第二集': 126,\n",
       " '才': 127,\n",
       " '真正': 128,\n",
       " '显露': 129,\n",
       " '野心': 130,\n",
       " '终于': 131,\n",
       " '抛弃': 132,\n",
       " '李忠志': 133,\n",
       " '新增': 134,\n",
       " '外来': 135,\n",
       " '班底': 136,\n",
       " '硬件': 137,\n",
       " '实力': 138,\n",
       " '机会': 139,\n",
       " '和': 140,\n",
       " '国际': 141,\n",
       " '接轨': 142,\n",
       " '开篇': 143,\n",
       " '水下': 144,\n",
       " '长镜头': 145,\n",
       " '诸如': 146,\n",
       " '铁丝网': 147,\n",
       " '拦截': 148,\n",
       " '弹头': 149,\n",
       " '细节': 150,\n",
       " '设计': 151,\n",
       " '国产': 152,\n",
       " '动作片': 153,\n",
       " '重新': 154,\n",
       " '封顶': 155,\n",
       " '理念': 156,\n",
       " '上': 157,\n",
       " '它': 158,\n",
       " '甚至': 159,\n",
       " '做到': 160,\n",
       " '绣春刀': 161,\n",
       " '最': 162,\n",
       " '那': 163,\n",
       " '部分': 164,\n",
       " '惊险': 165,\n",
       " '大气': 166,\n",
       " '引人入胜': 167,\n",
       " '结合': 168,\n",
       " '不俗': 169,\n",
       " '快': 170,\n",
       " '剪下': 171,\n",
       " '真刀真枪': 172,\n",
       " '不禁': 173,\n",
       " '热血沸腾': 174,\n",
       " '特别': 175,\n",
       " '弹簧床': 176,\n",
       " '架': 177,\n",
       " '挡': 178,\n",
       " '炸弹': 179,\n",
       " '空手': 180,\n",
       " '接': 181,\n",
       " '碎玻璃': 182,\n",
       " '弹匣': 183,\n",
       " '割喉': 184,\n",
       " '等': 185,\n",
       " '帅': 186,\n",
       " '飞起': 187,\n",
       " '就算': 188,\n",
       " '前半段': 189,\n",
       " '铺垫': 190,\n",
       " '节奏': 191,\n",
       " '散漫': 192,\n",
       " '主角': 193,\n",
       " '光环': 194,\n",
       " '开': 195,\n",
       " '太大': 196,\n",
       " '也': 197,\n",
       " '不怕': 198,\n",
       " '作为': 199,\n",
       " '一个': 200,\n",
       " '中国': 201,\n",
       " '两个': 202,\n",
       " '小时': 203,\n",
       " '弥漫着': 204,\n",
       " '强大': 205,\n",
       " '不可': 206,\n",
       " '侵犯': 207,\n",
       " '氛围': 208,\n",
       " '还是': 209,\n",
       " '那颗': 210,\n",
       " '民族': 211,\n",
       " '自豪': 212,\n",
       " '心': 213,\n",
       " '砰砰': 214,\n",
       " '砰': 215,\n",
       " '跳': 216,\n",
       " '不停': 217,\n",
       " '冷峰': 218,\n",
       " '这部': 219,\n",
       " '里': 220,\n",
       " '即': 221,\n",
       " '像': 222,\n",
       " '成龙': 223,\n",
       " '杰': 224,\n",
       " '森斯坦': 225,\n",
       " '森': 226,\n",
       " '体制': 227,\n",
       " '外': 228,\n",
       " '同': 229,\n",
       " '类型': 230,\n",
       " '总是': 231,\n",
       " '代表': 232,\n",
       " '个人': 233,\n",
       " '无能': 234,\n",
       " '政府': 235,\n",
       " '需要': 236,\n",
       " '求助于': 237,\n",
       " '这些': 238,\n",
       " '才能': 239,\n",
       " '解决': 240,\n",
       " '难题': 241,\n",
       " '体现': 242,\n",
       " '价值': 243,\n",
       " '所以': 244,\n",
       " '照抄': 245,\n",
       " '这种': 246,\n",
       " '模式': 247,\n",
       " '实际上': 248,\n",
       " '问题': 249,\n",
       " '我们': 250,\n",
       " '以前': 251,\n",
       " '嘲笑': 252,\n",
       " '英雄主义': 253,\n",
       " '却': 254,\n",
       " '没想到': 255,\n",
       " '捆绑': 256,\n",
       " '爱国主义': 257,\n",
       " '全能': 258,\n",
       " '战士': 259,\n",
       " '更加': 260,\n",
       " '难以': 261,\n",
       " '下咽': 262,\n",
       " '多': 263,\n",
       " '无脑': 264,\n",
       " '信': 265,\n",
       " '戏': 266,\n",
       " '对': 267,\n",
       " '路': 268,\n",
       " '转': 269,\n",
       " '粉': 270,\n",
       " '最后': 271,\n",
       " '彩蛋': 272,\n",
       " '没有': 273,\n",
       " '理由': 274,\n",
       " '期待': 275,\n",
       " '下': 276,\n",
       " '一部': 277,\n",
       " '假': 278,\n",
       " '嗨': 279,\n",
       " '几处': 280,\n",
       " '情节': 281,\n",
       " '设置': 282,\n",
       " '过于': 283,\n",
       " '彰显': 284,\n",
       " '国家': 285,\n",
       " '自豪感': 286,\n",
       " '稍显': 287,\n",
       " '突兀': 288,\n",
       " '爽': 289,\n",
       " '片': 290,\n",
       " '打戏': 291,\n",
       " '挺': 292,\n",
       " '燃': 293,\n",
       " '但是': 294,\n",
       " '故事': 295,\n",
       " '一般': 296,\n",
       " '达康': 297,\n",
       " '书记': 298,\n",
       " '合适': 299,\n",
       " '角色': 300,\n",
       " '赵': 301,\n",
       " '东来': 302,\n",
       " '倒': 303,\n",
       " '张瀚': 304,\n",
       " '太太': 305,\n",
       " '违': 306,\n",
       " '分钟': 307,\n",
       " '穿越': 308,\n",
       " '回': 309,\n",
       " '偶像剧': 310,\n",
       " '接到': 311,\n",
       " '非洲': 312,\n",
       " '卧底': 313,\n",
       " '冷锋': 314,\n",
       " '报告': 315,\n",
       " '丁义珍': 316,\n",
       " '现在': 317,\n",
       " '请求': 318,\n",
       " '抓捕': 319,\n",
       " '李达康': 320,\n",
       " '这件': 321,\n",
       " '事先': 322,\n",
       " '不要': 323,\n",
       " '声张': 324,\n",
       " '别': 325,\n",
       " '省厅': 326,\n",
       " '知道': 327,\n",
       " '就': 328,\n",
       " '你': 329,\n",
       " '一起': 330,\n",
       " '去': 331,\n",
       " '加上': 332,\n",
       " '同志': 333,\n",
       " '三人': 334,\n",
       " '逮捕': 335,\n",
       " '这次': 336,\n",
       " '行': 337,\n",
       " '叫': 338,\n",
       " '吧': 339,\n",
       " '拍': 340,\n",
       " '喜剧': 341,\n",
       " '整个': 342,\n",
       " '感觉': 343,\n",
       " '搞笑': 344,\n",
       " '这么': 345,\n",
       " '打': 346,\n",
       " '过': 347,\n",
       " '徐晓冬': 348,\n",
       " '么': 349,\n",
       " '往': 350,\n",
       " '一处': 351,\n",
       " '劲': 352,\n",
       " '使': 353,\n",
       " '梦想': 354,\n",
       " '看吧': 355,\n",
       " '第一部': 356,\n",
       " '好太多': 357,\n",
       " '谢谢': 358,\n",
       " '美': 359,\n",
       " '队': 360,\n",
       " '动作': 361,\n",
       " '指导': 362,\n",
       " '这': 363,\n",
       " '火': 364,\n",
       " '没见识': 365,\n",
       " '开头': 366,\n",
       " '长': 367,\n",
       " '对决': 368,\n",
       " '可算': 369,\n",
       " '华语': 370,\n",
       " '顶尖': 371,\n",
       " '存在': 372,\n",
       " '驱逐舰': 373,\n",
       " '导弹': 374,\n",
       " '坦克': 375,\n",
       " '商业片': 376,\n",
       " '狂': 377,\n",
       " '镜头': 378,\n",
       " '运用': 379,\n",
       " '笑': 380,\n",
       " '点': 381,\n",
       " '插入': 382,\n",
       " '好莱坞': 383,\n",
       " '爆米花': 384,\n",
       " '功': 385,\n",
       " '不过': 386,\n",
       " '从头': 387,\n",
       " '打到': 388,\n",
       " '尾': 389,\n",
       " '拼': 390,\n",
       " '虽然': 391,\n",
       " '有略': 392,\n",
       " '乱': 393,\n",
       " '时': 394,\n",
       " '因为': 395,\n",
       " '没': 396,\n",
       " '啥': 397,\n",
       " '期望值': 398,\n",
       " '被': 399,\n",
       " '吓了一跳': 400,\n",
       " '吴刚': 401,\n",
       " '谦和': 402,\n",
       " '丁海峰': 403,\n",
       " '老': 404,\n",
       " '三位': 405,\n",
       " '炖': 406,\n",
       " '烂熟': 407,\n",
       " '牛筋': 408,\n",
       " '嚼': 409,\n",
       " '用心': 410,\n",
       " '啊': 411,\n",
       " '导演': 412,\n",
       " '小看': 413,\n",
       " '确实': 414,\n",
       " '下功夫': 415,\n",
       " '拉': 416,\n",
       " '借鉴': 417,\n",
       " '至于': 418,\n",
       " '大家': 419,\n",
       " '比较': 420,\n",
       " '反感': 421,\n",
       " '情绪': 422,\n",
       " '那些': 423,\n",
       " '桥段': 424,\n",
       " '必备': 425,\n",
       " '稍微': 426,\n",
       " '一点': 427,\n",
       " '还': 428,\n",
       " '可以': 429,\n",
       " '接受': 430,\n",
       " '最好': 431,\n",
       " '地方': 432,\n",
       " '掌握': 433,\n",
       " '张弛': 434,\n",
       " '有度': 435,\n",
       " '这点': 436,\n",
       " '难得': 437,\n",
       " '一直': 438,\n",
       " '脑子里': 439,\n",
       " '回响': 440,\n",
       " '片头': 441,\n",
       " '海里': 442,\n",
       " '那场': 443,\n",
       " '完': 444,\n",
       " '呆': 445,\n",
       " '下去': 446,\n",
       " '太假': 447,\n",
       " '提前': 448,\n",
       " '离场': 449,\n",
       " '好看': 450,\n",
       " '演技': 451,\n",
       " '棒': 452,\n",
       " '符合': 453,\n",
       " '反而': 454,\n",
       " '更': 455,\n",
       " '差': 456,\n",
       " '这一': 457,\n",
       " '放之四海而皆准': 458,\n",
       " '规律': 459,\n",
       " '场面': 460,\n",
       " '越做越': 461,\n",
       " '然而': 462,\n",
       " '伴随': 463,\n",
       " '特效': 464,\n",
       " '升级': 465,\n",
       " '叙事': 466,\n",
       " '变得': 467,\n",
       " '非常': 468,\n",
       " '凌乱': 469,\n",
       " '格局': 470,\n",
       " '颇': 471,\n",
       " '拍成': 472,\n",
       " '黑鹰坠落': 473,\n",
       " '结果': 474,\n",
       " '撑': 475,\n",
       " '死': 476,\n",
       " '最多': 477,\n",
       " '只是': 478,\n",
       " '官方': 479,\n",
       " '版': 480,\n",
       " '敢死队': 481,\n",
       " '但论': 482,\n",
       " '自我': 483,\n",
       " '角色定位': 484,\n",
       " '能力': 485,\n",
       " '远': 486,\n",
       " '如同': 487,\n",
       " '演员': 488,\n",
       " '出身': 489,\n",
       " '甄子丹': 490,\n",
       " '喜欢': 491,\n",
       " '不是': 492,\n",
       " '装傻': 493,\n",
       " '真傻': 494,\n",
       " '要不是': 495,\n",
       " '真的': 496,\n",
       " '别的': 497,\n",
       " '可': 498,\n",
       " '肯定': 499,\n",
       " '选': 500,\n",
       " '直': 501,\n",
       " '男': 502,\n",
       " '癌': 503,\n",
       " '令人发指': 504,\n",
       " '所有': 505,\n",
       " '剧情': 506,\n",
       " '走向': 507,\n",
       " '九十年代': 508,\n",
       " '那套': 509,\n",
       " '照搬': 510,\n",
       " '审美': 511,\n",
       " '事儿': 512,\n",
       " '一时': 513,\n",
       " '会儿': 514,\n",
       " '培养': 515,\n",
       " '出来': 516,\n",
       " '整部': 517,\n",
       " '延续': 518,\n",
       " '风格': 519,\n",
       " '热血': 520,\n",
       " '要': 521,\n",
       " '不错': 522,\n",
       " '适合': 523,\n",
       " '演': 524,\n",
       " '军人': 525,\n",
       " '之前': 526,\n",
       " '片段': 527,\n",
       " '念': 528,\n",
       " '劲儿': 529,\n",
       " '来说': 530,\n",
       " '张翰': 531,\n",
       " '一': 532,\n",
       " '一股': 533,\n",
       " '雷阵雨': 534,\n",
       " '画风': 535,\n",
       " '目': 536,\n",
       " '瞪': 537,\n",
       " '狗': 538,\n",
       " '瘠薄': 539,\n",
       " '人牛': 540,\n",
       " 'b': 541,\n",
       " '硬道理': 542,\n",
       " '隔壁': 543,\n",
       " '建军': 544,\n",
       " '大爷': 545,\n",
       " '你们': 546,\n",
       " '场景': 547,\n",
       " '战斗': 548,\n",
       " '全线': 549,\n",
       " '打斗': 550,\n",
       " '游走': 551,\n",
       " '审查': 552,\n",
       " '红线': 553,\n",
       " '边界': 554,\n",
       " '政治': 555,\n",
       " '安全': 556,\n",
       " '缝隙': 557,\n",
       " '部': 558,\n",
       " '极具': 559,\n",
       " '煽动': 560,\n",
       " '大片': 561,\n",
       " '制作': 562,\n",
       " '精良': 563,\n",
       " '影片': 564,\n",
       " '请': 565,\n",
       " '多来': 566,\n",
       " '胶卷': 567,\n",
       " '过度': 568,\n",
       " '部队': 569,\n",
       " '没太多': 570,\n",
       " '展示': 571,\n",
       " '死去': 572,\n",
       " '反正': 573,\n",
       " '吸引': 574,\n",
       " '冲': 575,\n",
       " '为什么': 576,\n",
       " '鄙视': 577,\n",
       " '敢': 578,\n",
       " '开拓': 579,\n",
       " '允许': 580,\n",
       " '他们': 581,\n",
       " '再': 582,\n",
       " '直到': 583,\n",
       " '更好': 584,\n",
       " '拍出': 585,\n",
       " '出彩': 586,\n",
       " '呢': 587,\n",
       " '火爆': 588,\n",
       " '本片': 589,\n",
       " '必将': 590,\n",
       " '燃爆': 591,\n",
       " '暑期': 592,\n",
       " '厉害': 593,\n",
       " '身为': 594,\n",
       " '武打': 595,\n",
       " '高标准': 596,\n",
       " '枪战': 597,\n",
       " '为': 598,\n",
       " '点赞': 599,\n",
       " '热血男儿': 600,\n",
       " '荷尔蒙': 601,\n",
       " '爆发': 602,\n",
       " '给': 603,\n",
       " '星': 604,\n",
       " '血战': 605,\n",
       " '钢锯': 606,\n",
       " '岭': 607,\n",
       " '会': 608,\n",
       " '歌颂': 609,\n",
       " '宗教': 610,\n",
       " '情怀': 611,\n",
       " '超越': 612,\n",
       " '政权': 613,\n",
       " '当': 614,\n",
       " '只': 615,\n",
       " '明显': 616,\n",
       " '低': 617,\n",
       " '层次': 618,\n",
       " '充满': 619,\n",
       " '现实': 620,\n",
       " '乃至': 621,\n",
       " '投机': 622,\n",
       " '考量': 623,\n",
       " '高下': 624,\n",
       " '立': 625,\n",
       " '见': 626,\n",
       " '请问': 627,\n",
       " '脑': 628,\n",
       " '残': 629,\n",
       " '火箭炮': 630,\n",
       " '吗': 631,\n",
       " '傲气': 632,\n",
       " '雄鹰': 633,\n",
       " '第一': 634,\n",
       " '滴血': 635,\n",
       " '算是': 636,\n",
       " '国内': 637,\n",
       " '准': 638,\n",
       " '钱': 639,\n",
       " '花': 640,\n",
       " '有效': 641,\n",
       " '气魄': 642,\n",
       " '创作': 643,\n",
       " '足够': 644,\n",
       " '真诚': 645,\n",
       " '人物': 646,\n",
       " '连': 647,\n",
       " '可爱': 648,\n",
       " '如果': 649,\n",
       " '当年': 650,\n",
       " '那样': 651,\n",
       " '膨胀': 652,\n",
       " '银幕': 653,\n",
       " '独占': 654,\n",
       " '聚光灯': 655,\n",
       " '走': 656,\n",
       " '扪心自问': 657,\n",
       " '没法': 658,\n",
       " '评价': 659,\n",
       " '全片': 660,\n",
       " '靠': 661,\n",
       " '文戏': 662,\n",
       " '扯淡': 663,\n",
       " '女主角': 664,\n",
       " '毫无': 665,\n",
       " '必要': 666,\n",
       " '只要': 667,\n",
       " '开挂': 668,\n",
       " '牛': 669,\n",
       " '逼': 670,\n",
       " '之处': 671,\n",
       " '在于': 672,\n",
       " '透露': 673,\n",
       " '极': 674,\n",
       " '强烈': 675,\n",
       " '意识形态': 676,\n",
       " '枷锁': 677,\n",
       " '祖国': 678,\n",
       " '面前': 679,\n",
       " '一切': 680,\n",
       " '反动派': 681,\n",
       " '纸老虎': 682,\n",
       " '人开': 683,\n",
       " '挂': 684,\n",
       " '团灭': 685,\n",
       " '合情合理': 686,\n",
       " '两星': 687,\n",
       " '鼓励': 688,\n",
       " '其他': 689,\n",
       " '般': 690,\n",
       " '看点': 691,\n",
       " '有点': 692,\n",
       " '手接': 693,\n",
       " '哈哈哈': 694,\n",
       " '从': 695,\n",
       " '之后': 696,\n",
       " '炸': 697,\n",
       " '翻': 698,\n",
       " '一下': 699,\n",
       " '四星': 700,\n",
       " '当时': 701,\n",
       " '其实': 702,\n",
       " '完成度': 703,\n",
       " '接近': 704,\n",
       " '每个': 705,\n",
       " '步骤': 706,\n",
       " '顺滑': 707,\n",
       " '任何': 708,\n",
       " '出人意料': 709,\n",
       " '是因为': 710,\n",
       " '看看': 711,\n",
       " '最近': 712,\n",
       " '世界': 713,\n",
       " '抱歉': 714,\n",
       " '影院': 715,\n",
       " '起来': 716,\n",
       " '魔幻': 717,\n",
       " '当然': 718,\n",
       " '强拆': 719,\n",
       " '现实感': 720,\n",
       " '一幕': 721,\n",
       " '开场': 722,\n",
       " '搏斗': 723,\n",
       " '从来': 724,\n",
       " '其它': 725,\n",
       " '拍摄': 726,\n",
       " '难度': 727,\n",
       " '同时': 728,\n",
       " '技能': 729,\n",
       " '方面': 730,\n",
       " '要求': 731,\n",
       " '回来': 732,\n",
       " '搜': 733,\n",
       " '游泳': 734,\n",
       " '潜水': 735,\n",
       " '滑雪': 736,\n",
       " '飞机': 737,\n",
       " '射击': 738,\n",
       " '各项': 739,\n",
       " '特意': 740,\n",
       " '特种部队': 741,\n",
       " '当过': 742,\n",
       " '月': 743,\n",
       " '兵': 744,\n",
       " '佩服': 745,\n",
       " '这样': 746,\n",
       " '星半': 747,\n",
       " '结束': 748,\n",
       " '掌声': 749,\n",
       " '出现': 750,\n",
       " '近期': 751,\n",
       " '少见': 752,\n",
       " '一粒': 753,\n",
       " '大补丸': 754,\n",
       " '有人': 755,\n",
       " '吃': 756,\n",
       " '开心': 757,\n",
       " '补大': 758,\n",
       " '从白': 759,\n",
       " '黑': 760,\n",
       " '字幕': 761,\n",
       " '展现': 762,\n",
       " '超级': 763,\n",
       " '糙': 764,\n",
       " '猛': 765,\n",
       " '媲美': 766,\n",
       " '终结者': 767,\n",
       " '无': 768,\n",
       " '亮点': 769,\n",
       " '变': 770,\n",
       " '谐星': 771,\n",
       " '掌控': 772,\n",
       " '逼近': 773,\n",
       " '不住': 774,\n",
       " '边缘': 775,\n",
       " '带': 776,\n",
       " '感': 777,\n",
       " '拳拳': 778,\n",
       " '肉': 779,\n",
       " '超爽': 780,\n",
       " '聪明': 781,\n",
       " '鸡': 782,\n",
       " '贼': 783,\n",
       " '一面': 784,\n",
       " '旗下': 785,\n",
       " '呈现': 786,\n",
       " '一出': 787,\n",
       " '重工业': 788,\n",
       " '娱乐': 789,\n",
       " '调控': 790,\n",
       " '说教': 791,\n",
       " '比例': 792,\n",
       " '尺度': 793,\n",
       " '大众': 794,\n",
       " '接纳': 795,\n",
       " '把握': 796,\n",
       " '微妙': 797,\n",
       " '其中': 798,\n",
       " '一些': 799,\n",
       " '奇侠': 800,\n",
       " '化': 801,\n",
       " '内容': 802,\n",
       " '比如': 803,\n",
       " '玻璃': 804,\n",
       " '碴': 805,\n",
       " '子': 806,\n",
       " '飞镖': 807,\n",
       " '杀敌': 808,\n",
       " '一类': 809,\n",
       " '只不过': 810,\n",
       " '遮盖': 811,\n",
       " '掉': 812,\n",
       " '老爹': 813,\n",
       " '演过': 814,\n",
       " '美剧': 815,\n",
       " '搏击': 816,\n",
       " '王国': 817,\n",
       " '力荐': 818,\n",
       " '那部': 819,\n",
       " '为啥': 820,\n",
       " '奇异': 821,\n",
       " '恩典': 822,\n",
       " '配乐': 823,\n",
       " '画': 824,\n",
       " '内': 825,\n",
       " '男生': 826,\n",
       " '的话': 827,\n",
       " '应该': 828,\n",
       " '刺激': 829,\n",
       " '肾上腺素': 830,\n",
       " '女生': 831,\n",
       " '对龙': 832,\n",
       " '小云': 833,\n",
       " '感情': 834,\n",
       " '十分': 835,\n",
       " '打动': 836,\n",
       " '模仿': 837,\n",
       " '许多': 838,\n",
       " '怎么': 839,\n",
       " '玩': 840,\n",
       " '一股脑': 841,\n",
       " '堆': 842,\n",
       " '槽': 843,\n",
       " '几位': 844,\n",
       " '血': 845,\n",
       " '厚到': 846,\n",
       " '科幻': 847,\n",
       " '级别': 848,\n",
       " '重复': 849,\n",
       " '满血': 850,\n",
       " '红血': 851,\n",
       " '中毒': 852,\n",
       " '极速': 853,\n",
       " '回血': 854,\n",
       " '爆': 855,\n",
       " '种': 856,\n",
       " '打通': 857,\n",
       " '全场': 858,\n",
       " '太过': 859,\n",
       " '投机取巧': 860,\n",
       " '穿': 861,\n",
       " '迈克尔': 862,\n",
       " '贝都': 863,\n",
       " '不受': 864,\n",
       " '待见': 865,\n",
       " '国片': 866,\n",
       " '前仆后继': 867,\n",
       " '爆炸': 868,\n",
       " 'high': 869,\n",
       " '瞎': 870,\n",
       " '没用': 871,\n",
       " '女人': 872,\n",
       " '缺': 873,\n",
       " '男人': 874,\n",
       " '征服': 875,\n",
       " '美国': 876,\n",
       " '不行': 877,\n",
       " '全都': 878,\n",
       " '跟': 879,\n",
       " '跳墙': 880,\n",
       " '一样': 881,\n",
       " '拯救': 882,\n",
       " '国产片': 883,\n",
       " '以': 884,\n",
       " '中印': 885,\n",
       " '局势': 886,\n",
       " '对比': 887,\n",
       " '假想': 888,\n",
       " '真是': 889,\n",
       " '讽刺': 890,\n",
       " '谄媚': 891,\n",
       " '军旅': 892,\n",
       " '题材': 893,\n",
       " '质感': 894,\n",
       " '国外': 895,\n",
       " '精彩': 896,\n",
       " '看着': 897,\n",
       " '有力': 898,\n",
       " '必须': 899,\n",
       " '安利': 900,\n",
       " '张': 901,\n",
       " '翰': 902,\n",
       " '简直': 903,\n",
       " '承包': 904,\n",
       " '笑点': 905,\n",
       " '量身定做': 906,\n",
       " '彭于': 907,\n",
       " '晏': 908,\n",
       " '可演': 909,\n",
       " '不来': 910,\n",
       " '不少': 911,\n",
       " '漂移': 912,\n",
       " '无人机': 913,\n",
       " '突袭': 914,\n",
       " '直升机': 915,\n",
       " '坠': 916,\n",
       " '露': 917,\n",
       " '肉搏': 918,\n",
       " '军舰': 919,\n",
       " '发射': 920,\n",
       " '叛乱': 921,\n",
       " '国际化': 922,\n",
       " '视角': 923,\n",
       " '标配': 924,\n",
       " '饰演': 925,\n",
       " '深入人心': 926,\n",
       " '搏命': 927,\n",
       " '精神': 928,\n",
       " '当下': 929,\n",
       " '第三部': 930,\n",
       " '表白': 931,\n",
       " '典型': 932,\n",
       " '方式': 933,\n",
       " '每次': 934,\n",
       " '猜': 935,\n",
       " '诶': 936,\n",
       " '问': 937,\n",
       " '王牌': 938,\n",
       " '特工': 939,\n",
       " '那么': 940,\n",
       " '杀人': 941,\n",
       " '经过': 942,\n",
       " '艺术': 943,\n",
       " '处理': 944,\n",
       " '直接': 945,\n",
       " '删': 946,\n",
       " '血腥': 947,\n",
       " '屠杀': 948,\n",
       " '赤裸裸': 949,\n",
       " '大段': 950,\n",
       " '正确': 951,\n",
       " '庇': 952,\n",
       " '衣': 953,\n",
       " '意料之中': 954,\n",
       " '意料之外': 955,\n",
       " '惊喜': 956,\n",
       " '属于': 957,\n",
       " '狼性': 958,\n",
       " '军魂': 959,\n",
       " '几个': 960,\n",
       " '网红': 961,\n",
       " '弹弹琴': 962,\n",
       " '大国': 963,\n",
       " '气象': 964,\n",
       " '满屏': 965,\n",
       " '告诉': 966,\n",
       " '吴': 967,\n",
       " '迪塞尔': 968,\n",
       " '如入无人之境': 969,\n",
       " '亿': 970,\n",
       " '大陆': 971,\n",
       " '一刻': 972,\n",
       " '集体': 973,\n",
       " '勃起': 974,\n",
       " '离开': 975,\n",
       " '影厅': 976,\n",
       " '屌丝': 977,\n",
       " '同样': 978,\n",
       " '开始': 979,\n",
       " '前': 980,\n",
       " '屌': 981,\n",
       " '一万倍': 982,\n",
       " '一次': 983,\n",
       " '标准': 984,\n",
       " '打造': 985,\n",
       " '美式': 986,\n",
       " '不可逆转': 987,\n",
       " '缺点': 988,\n",
       " '笑料': 989,\n",
       " '一定': 990,\n",
       " '程度': 991,\n",
       " '地': 992,\n",
       " '破坏': 993,\n",
       " '节奏感': 994,\n",
       " '斥': 995,\n",
       " '巨资': 996,\n",
       " '炮制': 997,\n",
       " '有所': 998,\n",
       " '体验': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2index = {'unknown': 0}\n",
    "for word, _ in vocab.items():\n",
    "    word2index[word] = len(word2index)\n",
    "word2index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T06:59:24.691025Z",
     "start_time": "2020-05-14T06:59:24.654600Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'unknown',\n",
       " 1: '吴京',\n",
       " 2: '意淫',\n",
       " 3: '到',\n",
       " 4: '了',\n",
       " 5: '脑残',\n",
       " 6: '的',\n",
       " 7: '地步',\n",
       " 8: '看',\n",
       " 9: '恶心',\n",
       " 10: '想',\n",
       " 11: '吐',\n",
       " 12: '首映礼',\n",
       " 13: '太',\n",
       " 14: '恐怖',\n",
       " 15: '这个',\n",
       " 16: '电影',\n",
       " 17: '不讲道理',\n",
       " 18: '完全',\n",
       " 19: '就是',\n",
       " 20: '在',\n",
       " 21: '实现',\n",
       " 22: '他',\n",
       " 23: '小',\n",
       " 24: '粉红',\n",
       " 25: '英雄',\n",
       " 26: '梦',\n",
       " 27: '各种',\n",
       " 28: '装备',\n",
       " 29: '轮番',\n",
       " 30: '上场',\n",
       " 31: '视',\n",
       " 32: '物理',\n",
       " 33: '逻辑',\n",
       " 34: '于',\n",
       " 35: '不顾',\n",
       " 36: '不得不',\n",
       " 37: '说',\n",
       " 38: '有钱',\n",
       " 39: '真',\n",
       " 40: '好',\n",
       " 41: '随意',\n",
       " 42: '胡闹',\n",
       " 43: '炒作',\n",
       " 44: '水平',\n",
       " 45: '不输',\n",
       " 46: '冯小刚',\n",
       " 47: '但',\n",
       " 48: '小刚',\n",
       " 49: '至少',\n",
       " 50: '不会',\n",
       " 51: '用',\n",
       " 52: '主旋律',\n",
       " 53: '来',\n",
       " 54: '让',\n",
       " 55: '人',\n",
       " 56: '不',\n",
       " 57: '舒服',\n",
       " 58: '为了',\n",
       " 59: '而',\n",
       " 60: '煽情',\n",
       " 61: '觉得',\n",
       " 62: '是',\n",
       " 63: '个',\n",
       " 64: '大',\n",
       " 65: '做作',\n",
       " 66: '谎言',\n",
       " 67: '家',\n",
       " 68: '更新',\n",
       " 69: '片子',\n",
       " 70: '整体',\n",
       " 71: '不如',\n",
       " 72: '湄公河',\n",
       " 73: '行动',\n",
       " 74: '不够',\n",
       " 75: '流畅',\n",
       " 76: '编剧',\n",
       " 77: '有毒',\n",
       " 78: '台词',\n",
       " 79: '尴尬',\n",
       " 80: '刻意',\n",
       " 81: '显得',\n",
       " 82: '如此',\n",
       " 83: '不合时宜',\n",
       " 84: '又',\n",
       " 85: '多余',\n",
       " 86: '凭良心说',\n",
       " 87: '看到',\n",
       " 88: '不像',\n",
       " 89: '战狼',\n",
       " 90: '续集',\n",
       " 91: '完虐',\n",
       " 92: '中二',\n",
       " 93: '得',\n",
       " 94: '很',\n",
       " 95: '犯',\n",
       " 96: '我',\n",
       " 97: '中华',\n",
       " 98: '者',\n",
       " 99: '虽远',\n",
       " 100: '必',\n",
       " 101: '诛',\n",
       " 102: '比',\n",
       " 103: '这句',\n",
       " 104: '话',\n",
       " 105: '还要',\n",
       " 106: '一百倍',\n",
       " 107: '脑子',\n",
       " 108: '东西',\n",
       " 109: '希望',\n",
       " 110: '们',\n",
       " 111: '都',\n",
       " 112: '能',\n",
       " 113: '有',\n",
       " 114: '三星',\n",
       " 115: '半',\n",
       " 116: '实打实',\n",
       " 117: '分',\n",
       " 118: '第一集',\n",
       " 119: '爱国',\n",
       " 120: '内部',\n",
       " 121: '做',\n",
       " 122: '着',\n",
       " 123: '置换',\n",
       " 124: '与',\n",
       " 125: '较劲',\n",
       " 126: '第二集',\n",
       " 127: '才',\n",
       " 128: '真正',\n",
       " 129: '显露',\n",
       " 130: '野心',\n",
       " 131: '终于',\n",
       " 132: '抛弃',\n",
       " 133: '李忠志',\n",
       " 134: '新增',\n",
       " 135: '外来',\n",
       " 136: '班底',\n",
       " 137: '硬件',\n",
       " 138: '实力',\n",
       " 139: '机会',\n",
       " 140: '和',\n",
       " 141: '国际',\n",
       " 142: '接轨',\n",
       " 143: '开篇',\n",
       " 144: '水下',\n",
       " 145: '长镜头',\n",
       " 146: '诸如',\n",
       " 147: '铁丝网',\n",
       " 148: '拦截',\n",
       " 149: '弹头',\n",
       " 150: '细节',\n",
       " 151: '设计',\n",
       " 152: '国产',\n",
       " 153: '动作片',\n",
       " 154: '重新',\n",
       " 155: '封顶',\n",
       " 156: '理念',\n",
       " 157: '上',\n",
       " 158: '它',\n",
       " 159: '甚至',\n",
       " 160: '做到',\n",
       " 161: '绣春刀',\n",
       " 162: '最',\n",
       " 163: '那',\n",
       " 164: '部分',\n",
       " 165: '惊险',\n",
       " 166: '大气',\n",
       " 167: '引人入胜',\n",
       " 168: '结合',\n",
       " 169: '不俗',\n",
       " 170: '快',\n",
       " 171: '剪下',\n",
       " 172: '真刀真枪',\n",
       " 173: '不禁',\n",
       " 174: '热血沸腾',\n",
       " 175: '特别',\n",
       " 176: '弹簧床',\n",
       " 177: '架',\n",
       " 178: '挡',\n",
       " 179: '炸弹',\n",
       " 180: '空手',\n",
       " 181: '接',\n",
       " 182: '碎玻璃',\n",
       " 183: '弹匣',\n",
       " 184: '割喉',\n",
       " 185: '等',\n",
       " 186: '帅',\n",
       " 187: '飞起',\n",
       " 188: '就算',\n",
       " 189: '前半段',\n",
       " 190: '铺垫',\n",
       " 191: '节奏',\n",
       " 192: '散漫',\n",
       " 193: '主角',\n",
       " 194: '光环',\n",
       " 195: '开',\n",
       " 196: '太大',\n",
       " 197: '也',\n",
       " 198: '不怕',\n",
       " 199: '作为',\n",
       " 200: '一个',\n",
       " 201: '中国',\n",
       " 202: '两个',\n",
       " 203: '小时',\n",
       " 204: '弥漫着',\n",
       " 205: '强大',\n",
       " 206: '不可',\n",
       " 207: '侵犯',\n",
       " 208: '氛围',\n",
       " 209: '还是',\n",
       " 210: '那颗',\n",
       " 211: '民族',\n",
       " 212: '自豪',\n",
       " 213: '心',\n",
       " 214: '砰砰',\n",
       " 215: '砰',\n",
       " 216: '跳',\n",
       " 217: '不停',\n",
       " 218: '冷峰',\n",
       " 219: '这部',\n",
       " 220: '里',\n",
       " 221: '即',\n",
       " 222: '像',\n",
       " 223: '成龙',\n",
       " 224: '杰',\n",
       " 225: '森斯坦',\n",
       " 226: '森',\n",
       " 227: '体制',\n",
       " 228: '外',\n",
       " 229: '同',\n",
       " 230: '类型',\n",
       " 231: '总是',\n",
       " 232: '代表',\n",
       " 233: '个人',\n",
       " 234: '无能',\n",
       " 235: '政府',\n",
       " 236: '需要',\n",
       " 237: '求助于',\n",
       " 238: '这些',\n",
       " 239: '才能',\n",
       " 240: '解决',\n",
       " 241: '难题',\n",
       " 242: '体现',\n",
       " 243: '价值',\n",
       " 244: '所以',\n",
       " 245: '照抄',\n",
       " 246: '这种',\n",
       " 247: '模式',\n",
       " 248: '实际上',\n",
       " 249: '问题',\n",
       " 250: '我们',\n",
       " 251: '以前',\n",
       " 252: '嘲笑',\n",
       " 253: '英雄主义',\n",
       " 254: '却',\n",
       " 255: '没想到',\n",
       " 256: '捆绑',\n",
       " 257: '爱国主义',\n",
       " 258: '全能',\n",
       " 259: '战士',\n",
       " 260: '更加',\n",
       " 261: '难以',\n",
       " 262: '下咽',\n",
       " 263: '多',\n",
       " 264: '无脑',\n",
       " 265: '信',\n",
       " 266: '戏',\n",
       " 267: '对',\n",
       " 268: '路',\n",
       " 269: '转',\n",
       " 270: '粉',\n",
       " 271: '最后',\n",
       " 272: '彩蛋',\n",
       " 273: '没有',\n",
       " 274: '理由',\n",
       " 275: '期待',\n",
       " 276: '下',\n",
       " 277: '一部',\n",
       " 278: '假',\n",
       " 279: '嗨',\n",
       " 280: '几处',\n",
       " 281: '情节',\n",
       " 282: '设置',\n",
       " 283: '过于',\n",
       " 284: '彰显',\n",
       " 285: '国家',\n",
       " 286: '自豪感',\n",
       " 287: '稍显',\n",
       " 288: '突兀',\n",
       " 289: '爽',\n",
       " 290: '片',\n",
       " 291: '打戏',\n",
       " 292: '挺',\n",
       " 293: '燃',\n",
       " 294: '但是',\n",
       " 295: '故事',\n",
       " 296: '一般',\n",
       " 297: '达康',\n",
       " 298: '书记',\n",
       " 299: '合适',\n",
       " 300: '角色',\n",
       " 301: '赵',\n",
       " 302: '东来',\n",
       " 303: '倒',\n",
       " 304: '张瀚',\n",
       " 305: '太太',\n",
       " 306: '违',\n",
       " 307: '分钟',\n",
       " 308: '穿越',\n",
       " 309: '回',\n",
       " 310: '偶像剧',\n",
       " 311: '接到',\n",
       " 312: '非洲',\n",
       " 313: '卧底',\n",
       " 314: '冷锋',\n",
       " 315: '报告',\n",
       " 316: '丁义珍',\n",
       " 317: '现在',\n",
       " 318: '请求',\n",
       " 319: '抓捕',\n",
       " 320: '李达康',\n",
       " 321: '这件',\n",
       " 322: '事先',\n",
       " 323: '不要',\n",
       " 324: '声张',\n",
       " 325: '别',\n",
       " 326: '省厅',\n",
       " 327: '知道',\n",
       " 328: '就',\n",
       " 329: '你',\n",
       " 330: '一起',\n",
       " 331: '去',\n",
       " 332: '加上',\n",
       " 333: '同志',\n",
       " 334: '三人',\n",
       " 335: '逮捕',\n",
       " 336: '这次',\n",
       " 337: '行',\n",
       " 338: '叫',\n",
       " 339: '吧',\n",
       " 340: '拍',\n",
       " 341: '喜剧',\n",
       " 342: '整个',\n",
       " 343: '感觉',\n",
       " 344: '搞笑',\n",
       " 345: '这么',\n",
       " 346: '打',\n",
       " 347: '过',\n",
       " 348: '徐晓冬',\n",
       " 349: '么',\n",
       " 350: '往',\n",
       " 351: '一处',\n",
       " 352: '劲',\n",
       " 353: '使',\n",
       " 354: '梦想',\n",
       " 355: '看吧',\n",
       " 356: '第一部',\n",
       " 357: '好太多',\n",
       " 358: '谢谢',\n",
       " 359: '美',\n",
       " 360: '队',\n",
       " 361: '动作',\n",
       " 362: '指导',\n",
       " 363: '这',\n",
       " 364: '火',\n",
       " 365: '没见识',\n",
       " 366: '开头',\n",
       " 367: '长',\n",
       " 368: '对决',\n",
       " 369: '可算',\n",
       " 370: '华语',\n",
       " 371: '顶尖',\n",
       " 372: '存在',\n",
       " 373: '驱逐舰',\n",
       " 374: '导弹',\n",
       " 375: '坦克',\n",
       " 376: '商业片',\n",
       " 377: '狂',\n",
       " 378: '镜头',\n",
       " 379: '运用',\n",
       " 380: '笑',\n",
       " 381: '点',\n",
       " 382: '插入',\n",
       " 383: '好莱坞',\n",
       " 384: '爆米花',\n",
       " 385: '功',\n",
       " 386: '不过',\n",
       " 387: '从头',\n",
       " 388: '打到',\n",
       " 389: '尾',\n",
       " 390: '拼',\n",
       " 391: '虽然',\n",
       " 392: '有略',\n",
       " 393: '乱',\n",
       " 394: '时',\n",
       " 395: '因为',\n",
       " 396: '没',\n",
       " 397: '啥',\n",
       " 398: '期望值',\n",
       " 399: '被',\n",
       " 400: '吓了一跳',\n",
       " 401: '吴刚',\n",
       " 402: '谦和',\n",
       " 403: '丁海峰',\n",
       " 404: '老',\n",
       " 405: '三位',\n",
       " 406: '炖',\n",
       " 407: '烂熟',\n",
       " 408: '牛筋',\n",
       " 409: '嚼',\n",
       " 410: '用心',\n",
       " 411: '啊',\n",
       " 412: '导演',\n",
       " 413: '小看',\n",
       " 414: '确实',\n",
       " 415: '下功夫',\n",
       " 416: '拉',\n",
       " 417: '借鉴',\n",
       " 418: '至于',\n",
       " 419: '大家',\n",
       " 420: '比较',\n",
       " 421: '反感',\n",
       " 422: '情绪',\n",
       " 423: '那些',\n",
       " 424: '桥段',\n",
       " 425: '必备',\n",
       " 426: '稍微',\n",
       " 427: '一点',\n",
       " 428: '还',\n",
       " 429: '可以',\n",
       " 430: '接受',\n",
       " 431: '最好',\n",
       " 432: '地方',\n",
       " 433: '掌握',\n",
       " 434: '张弛',\n",
       " 435: '有度',\n",
       " 436: '这点',\n",
       " 437: '难得',\n",
       " 438: '一直',\n",
       " 439: '脑子里',\n",
       " 440: '回响',\n",
       " 441: '片头',\n",
       " 442: '海里',\n",
       " 443: '那场',\n",
       " 444: '完',\n",
       " 445: '呆',\n",
       " 446: '下去',\n",
       " 447: '太假',\n",
       " 448: '提前',\n",
       " 449: '离场',\n",
       " 450: '好看',\n",
       " 451: '演技',\n",
       " 452: '棒',\n",
       " 453: '符合',\n",
       " 454: '反而',\n",
       " 455: '更',\n",
       " 456: '差',\n",
       " 457: '这一',\n",
       " 458: '放之四海而皆准',\n",
       " 459: '规律',\n",
       " 460: '场面',\n",
       " 461: '越做越',\n",
       " 462: '然而',\n",
       " 463: '伴随',\n",
       " 464: '特效',\n",
       " 465: '升级',\n",
       " 466: '叙事',\n",
       " 467: '变得',\n",
       " 468: '非常',\n",
       " 469: '凌乱',\n",
       " 470: '格局',\n",
       " 471: '颇',\n",
       " 472: '拍成',\n",
       " 473: '黑鹰坠落',\n",
       " 474: '结果',\n",
       " 475: '撑',\n",
       " 476: '死',\n",
       " 477: '最多',\n",
       " 478: '只是',\n",
       " 479: '官方',\n",
       " 480: '版',\n",
       " 481: '敢死队',\n",
       " 482: '但论',\n",
       " 483: '自我',\n",
       " 484: '角色定位',\n",
       " 485: '能力',\n",
       " 486: '远',\n",
       " 487: '如同',\n",
       " 488: '演员',\n",
       " 489: '出身',\n",
       " 490: '甄子丹',\n",
       " 491: '喜欢',\n",
       " 492: '不是',\n",
       " 493: '装傻',\n",
       " 494: '真傻',\n",
       " 495: '要不是',\n",
       " 496: '真的',\n",
       " 497: '别的',\n",
       " 498: '可',\n",
       " 499: '肯定',\n",
       " 500: '选',\n",
       " 501: '直',\n",
       " 502: '男',\n",
       " 503: '癌',\n",
       " 504: '令人发指',\n",
       " 505: '所有',\n",
       " 506: '剧情',\n",
       " 507: '走向',\n",
       " 508: '九十年代',\n",
       " 509: '那套',\n",
       " 510: '照搬',\n",
       " 511: '审美',\n",
       " 512: '事儿',\n",
       " 513: '一时',\n",
       " 514: '会儿',\n",
       " 515: '培养',\n",
       " 516: '出来',\n",
       " 517: '整部',\n",
       " 518: '延续',\n",
       " 519: '风格',\n",
       " 520: '热血',\n",
       " 521: '要',\n",
       " 522: '不错',\n",
       " 523: '适合',\n",
       " 524: '演',\n",
       " 525: '军人',\n",
       " 526: '之前',\n",
       " 527: '片段',\n",
       " 528: '念',\n",
       " 529: '劲儿',\n",
       " 530: '来说',\n",
       " 531: '张翰',\n",
       " 532: '一',\n",
       " 533: '一股',\n",
       " 534: '雷阵雨',\n",
       " 535: '画风',\n",
       " 536: '目',\n",
       " 537: '瞪',\n",
       " 538: '狗',\n",
       " 539: '瘠薄',\n",
       " 540: '人牛',\n",
       " 541: 'b',\n",
       " 542: '硬道理',\n",
       " 543: '隔壁',\n",
       " 544: '建军',\n",
       " 545: '大爷',\n",
       " 546: '你们',\n",
       " 547: '场景',\n",
       " 548: '战斗',\n",
       " 549: '全线',\n",
       " 550: '打斗',\n",
       " 551: '游走',\n",
       " 552: '审查',\n",
       " 553: '红线',\n",
       " 554: '边界',\n",
       " 555: '政治',\n",
       " 556: '安全',\n",
       " 557: '缝隙',\n",
       " 558: '部',\n",
       " 559: '极具',\n",
       " 560: '煽动',\n",
       " 561: '大片',\n",
       " 562: '制作',\n",
       " 563: '精良',\n",
       " 564: '影片',\n",
       " 565: '请',\n",
       " 566: '多来',\n",
       " 567: '胶卷',\n",
       " 568: '过度',\n",
       " 569: '部队',\n",
       " 570: '没太多',\n",
       " 571: '展示',\n",
       " 572: '死去',\n",
       " 573: '反正',\n",
       " 574: '吸引',\n",
       " 575: '冲',\n",
       " 576: '为什么',\n",
       " 577: '鄙视',\n",
       " 578: '敢',\n",
       " 579: '开拓',\n",
       " 580: '允许',\n",
       " 581: '他们',\n",
       " 582: '再',\n",
       " 583: '直到',\n",
       " 584: '更好',\n",
       " 585: '拍出',\n",
       " 586: '出彩',\n",
       " 587: '呢',\n",
       " 588: '火爆',\n",
       " 589: '本片',\n",
       " 590: '必将',\n",
       " 591: '燃爆',\n",
       " 592: '暑期',\n",
       " 593: '厉害',\n",
       " 594: '身为',\n",
       " 595: '武打',\n",
       " 596: '高标准',\n",
       " 597: '枪战',\n",
       " 598: '为',\n",
       " 599: '点赞',\n",
       " 600: '热血男儿',\n",
       " 601: '荷尔蒙',\n",
       " 602: '爆发',\n",
       " 603: '给',\n",
       " 604: '星',\n",
       " 605: '血战',\n",
       " 606: '钢锯',\n",
       " 607: '岭',\n",
       " 608: '会',\n",
       " 609: '歌颂',\n",
       " 610: '宗教',\n",
       " 611: '情怀',\n",
       " 612: '超越',\n",
       " 613: '政权',\n",
       " 614: '当',\n",
       " 615: '只',\n",
       " 616: '明显',\n",
       " 617: '低',\n",
       " 618: '层次',\n",
       " 619: '充满',\n",
       " 620: '现实',\n",
       " 621: '乃至',\n",
       " 622: '投机',\n",
       " 623: '考量',\n",
       " 624: '高下',\n",
       " 625: '立',\n",
       " 626: '见',\n",
       " 627: '请问',\n",
       " 628: '脑',\n",
       " 629: '残',\n",
       " 630: '火箭炮',\n",
       " 631: '吗',\n",
       " 632: '傲气',\n",
       " 633: '雄鹰',\n",
       " 634: '第一',\n",
       " 635: '滴血',\n",
       " 636: '算是',\n",
       " 637: '国内',\n",
       " 638: '准',\n",
       " 639: '钱',\n",
       " 640: '花',\n",
       " 641: '有效',\n",
       " 642: '气魄',\n",
       " 643: '创作',\n",
       " 644: '足够',\n",
       " 645: '真诚',\n",
       " 646: '人物',\n",
       " 647: '连',\n",
       " 648: '可爱',\n",
       " 649: '如果',\n",
       " 650: '当年',\n",
       " 651: '那样',\n",
       " 652: '膨胀',\n",
       " 653: '银幕',\n",
       " 654: '独占',\n",
       " 655: '聚光灯',\n",
       " 656: '走',\n",
       " 657: '扪心自问',\n",
       " 658: '没法',\n",
       " 659: '评价',\n",
       " 660: '全片',\n",
       " 661: '靠',\n",
       " 662: '文戏',\n",
       " 663: '扯淡',\n",
       " 664: '女主角',\n",
       " 665: '毫无',\n",
       " 666: '必要',\n",
       " 667: '只要',\n",
       " 668: '开挂',\n",
       " 669: '牛',\n",
       " 670: '逼',\n",
       " 671: '之处',\n",
       " 672: '在于',\n",
       " 673: '透露',\n",
       " 674: '极',\n",
       " 675: '强烈',\n",
       " 676: '意识形态',\n",
       " 677: '枷锁',\n",
       " 678: '祖国',\n",
       " 679: '面前',\n",
       " 680: '一切',\n",
       " 681: '反动派',\n",
       " 682: '纸老虎',\n",
       " 683: '人开',\n",
       " 684: '挂',\n",
       " 685: '团灭',\n",
       " 686: '合情合理',\n",
       " 687: '两星',\n",
       " 688: '鼓励',\n",
       " 689: '其他',\n",
       " 690: '般',\n",
       " 691: '看点',\n",
       " 692: '有点',\n",
       " 693: '手接',\n",
       " 694: '哈哈哈',\n",
       " 695: '从',\n",
       " 696: '之后',\n",
       " 697: '炸',\n",
       " 698: '翻',\n",
       " 699: '一下',\n",
       " 700: '四星',\n",
       " 701: '当时',\n",
       " 702: '其实',\n",
       " 703: '完成度',\n",
       " 704: '接近',\n",
       " 705: '每个',\n",
       " 706: '步骤',\n",
       " 707: '顺滑',\n",
       " 708: '任何',\n",
       " 709: '出人意料',\n",
       " 710: '是因为',\n",
       " 711: '看看',\n",
       " 712: '最近',\n",
       " 713: '世界',\n",
       " 714: '抱歉',\n",
       " 715: '影院',\n",
       " 716: '起来',\n",
       " 717: '魔幻',\n",
       " 718: '当然',\n",
       " 719: '强拆',\n",
       " 720: '现实感',\n",
       " 721: '一幕',\n",
       " 722: '开场',\n",
       " 723: '搏斗',\n",
       " 724: '从来',\n",
       " 725: '其它',\n",
       " 726: '拍摄',\n",
       " 727: '难度',\n",
       " 728: '同时',\n",
       " 729: '技能',\n",
       " 730: '方面',\n",
       " 731: '要求',\n",
       " 732: '回来',\n",
       " 733: '搜',\n",
       " 734: '游泳',\n",
       " 735: '潜水',\n",
       " 736: '滑雪',\n",
       " 737: '飞机',\n",
       " 738: '射击',\n",
       " 739: '各项',\n",
       " 740: '特意',\n",
       " 741: '特种部队',\n",
       " 742: '当过',\n",
       " 743: '月',\n",
       " 744: '兵',\n",
       " 745: '佩服',\n",
       " 746: '这样',\n",
       " 747: '星半',\n",
       " 748: '结束',\n",
       " 749: '掌声',\n",
       " 750: '出现',\n",
       " 751: '近期',\n",
       " 752: '少见',\n",
       " 753: '一粒',\n",
       " 754: '大补丸',\n",
       " 755: '有人',\n",
       " 756: '吃',\n",
       " 757: '开心',\n",
       " 758: '补大',\n",
       " 759: '从白',\n",
       " 760: '黑',\n",
       " 761: '字幕',\n",
       " 762: '展现',\n",
       " 763: '超级',\n",
       " 764: '糙',\n",
       " 765: '猛',\n",
       " 766: '媲美',\n",
       " 767: '终结者',\n",
       " 768: '无',\n",
       " 769: '亮点',\n",
       " 770: '变',\n",
       " 771: '谐星',\n",
       " 772: '掌控',\n",
       " 773: '逼近',\n",
       " 774: '不住',\n",
       " 775: '边缘',\n",
       " 776: '带',\n",
       " 777: '感',\n",
       " 778: '拳拳',\n",
       " 779: '肉',\n",
       " 780: '超爽',\n",
       " 781: '聪明',\n",
       " 782: '鸡',\n",
       " 783: '贼',\n",
       " 784: '一面',\n",
       " 785: '旗下',\n",
       " 786: '呈现',\n",
       " 787: '一出',\n",
       " 788: '重工业',\n",
       " 789: '娱乐',\n",
       " 790: '调控',\n",
       " 791: '说教',\n",
       " 792: '比例',\n",
       " 793: '尺度',\n",
       " 794: '大众',\n",
       " 795: '接纳',\n",
       " 796: '把握',\n",
       " 797: '微妙',\n",
       " 798: '其中',\n",
       " 799: '一些',\n",
       " 800: '奇侠',\n",
       " 801: '化',\n",
       " 802: '内容',\n",
       " 803: '比如',\n",
       " 804: '玻璃',\n",
       " 805: '碴',\n",
       " 806: '子',\n",
       " 807: '飞镖',\n",
       " 808: '杀敌',\n",
       " 809: '一类',\n",
       " 810: '只不过',\n",
       " 811: '遮盖',\n",
       " 812: '掉',\n",
       " 813: '老爹',\n",
       " 814: '演过',\n",
       " 815: '美剧',\n",
       " 816: '搏击',\n",
       " 817: '王国',\n",
       " 818: '力荐',\n",
       " 819: '那部',\n",
       " 820: '为啥',\n",
       " 821: '奇异',\n",
       " 822: '恩典',\n",
       " 823: '配乐',\n",
       " 824: '画',\n",
       " 825: '内',\n",
       " 826: '男生',\n",
       " 827: '的话',\n",
       " 828: '应该',\n",
       " 829: '刺激',\n",
       " 830: '肾上腺素',\n",
       " 831: '女生',\n",
       " 832: '对龙',\n",
       " 833: '小云',\n",
       " 834: '感情',\n",
       " 835: '十分',\n",
       " 836: '打动',\n",
       " 837: '模仿',\n",
       " 838: '许多',\n",
       " 839: '怎么',\n",
       " 840: '玩',\n",
       " 841: '一股脑',\n",
       " 842: '堆',\n",
       " 843: '槽',\n",
       " 844: '几位',\n",
       " 845: '血',\n",
       " 846: '厚到',\n",
       " 847: '科幻',\n",
       " 848: '级别',\n",
       " 849: '重复',\n",
       " 850: '满血',\n",
       " 851: '红血',\n",
       " 852: '中毒',\n",
       " 853: '极速',\n",
       " 854: '回血',\n",
       " 855: '爆',\n",
       " 856: '种',\n",
       " 857: '打通',\n",
       " 858: '全场',\n",
       " 859: '太过',\n",
       " 860: '投机取巧',\n",
       " 861: '穿',\n",
       " 862: '迈克尔',\n",
       " 863: '贝都',\n",
       " 864: '不受',\n",
       " 865: '待见',\n",
       " 866: '国片',\n",
       " 867: '前仆后继',\n",
       " 868: '爆炸',\n",
       " 869: 'high',\n",
       " 870: '瞎',\n",
       " 871: '没用',\n",
       " 872: '女人',\n",
       " 873: '缺',\n",
       " 874: '男人',\n",
       " 875: '征服',\n",
       " 876: '美国',\n",
       " 877: '不行',\n",
       " 878: '全都',\n",
       " 879: '跟',\n",
       " 880: '跳墙',\n",
       " 881: '一样',\n",
       " 882: '拯救',\n",
       " 883: '国产片',\n",
       " 884: '以',\n",
       " 885: '中印',\n",
       " 886: '局势',\n",
       " 887: '对比',\n",
       " 888: '假想',\n",
       " 889: '真是',\n",
       " 890: '讽刺',\n",
       " 891: '谄媚',\n",
       " 892: '军旅',\n",
       " 893: '题材',\n",
       " 894: '质感',\n",
       " 895: '国外',\n",
       " 896: '精彩',\n",
       " 897: '看着',\n",
       " 898: '有力',\n",
       " 899: '必须',\n",
       " 900: '安利',\n",
       " 901: '张',\n",
       " 902: '翰',\n",
       " 903: '简直',\n",
       " 904: '承包',\n",
       " 905: '笑点',\n",
       " 906: '量身定做',\n",
       " 907: '彭于',\n",
       " 908: '晏',\n",
       " 909: '可演',\n",
       " 910: '不来',\n",
       " 911: '不少',\n",
       " 912: '漂移',\n",
       " 913: '无人机',\n",
       " 914: '突袭',\n",
       " 915: '直升机',\n",
       " 916: '坠',\n",
       " 917: '露',\n",
       " 918: '肉搏',\n",
       " 919: '军舰',\n",
       " 920: '发射',\n",
       " 921: '叛乱',\n",
       " 922: '国际化',\n",
       " 923: '视角',\n",
       " 924: '标配',\n",
       " 925: '饰演',\n",
       " 926: '深入人心',\n",
       " 927: '搏命',\n",
       " 928: '精神',\n",
       " 929: '当下',\n",
       " 930: '第三部',\n",
       " 931: '表白',\n",
       " 932: '典型',\n",
       " 933: '方式',\n",
       " 934: '每次',\n",
       " 935: '猜',\n",
       " 936: '诶',\n",
       " 937: '问',\n",
       " 938: '王牌',\n",
       " 939: '特工',\n",
       " 940: '那么',\n",
       " 941: '杀人',\n",
       " 942: '经过',\n",
       " 943: '艺术',\n",
       " 944: '处理',\n",
       " 945: '直接',\n",
       " 946: '删',\n",
       " 947: '血腥',\n",
       " 948: '屠杀',\n",
       " 949: '赤裸裸',\n",
       " 950: '大段',\n",
       " 951: '正确',\n",
       " 952: '庇',\n",
       " 953: '衣',\n",
       " 954: '意料之中',\n",
       " 955: '意料之外',\n",
       " 956: '惊喜',\n",
       " 957: '属于',\n",
       " 958: '狼性',\n",
       " 959: '军魂',\n",
       " 960: '几个',\n",
       " 961: '网红',\n",
       " 962: '弹弹琴',\n",
       " 963: '大国',\n",
       " 964: '气象',\n",
       " 965: '满屏',\n",
       " 966: '告诉',\n",
       " 967: '吴',\n",
       " 968: '迪塞尔',\n",
       " 969: '如入无人之境',\n",
       " 970: '亿',\n",
       " 971: '大陆',\n",
       " 972: '一刻',\n",
       " 973: '集体',\n",
       " 974: '勃起',\n",
       " 975: '离开',\n",
       " 976: '影厅',\n",
       " 977: '屌丝',\n",
       " 978: '同样',\n",
       " 979: '开始',\n",
       " 980: '前',\n",
       " 981: '屌',\n",
       " 982: '一万倍',\n",
       " 983: '一次',\n",
       " 984: '标准',\n",
       " 985: '打造',\n",
       " 986: '美式',\n",
       " 987: '不可逆转',\n",
       " 988: '缺点',\n",
       " 989: '笑料',\n",
       " 990: '一定',\n",
       " 991: '程度',\n",
       " 992: '地',\n",
       " 993: '破坏',\n",
       " 994: '节奏感',\n",
       " 995: '斥',\n",
       " 996: '巨资',\n",
       " 997: '炮制',\n",
       " 998: '有所',\n",
       " 999: '体验',\n",
       " ...}"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index2word = {ind: w for w, ind in word2index.items()}\n",
    "index2word"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 将文本转化成向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:01:08.183408Z",
     "start_time": "2020-05-14T07:01:08.178013Z"
    }
   },
   "outputs": [],
   "source": [
    "def sent2vec(sent):\n",
    "    \"\"\"Map a tokenised sentence to its list of vocabulary indices.\n",
    "\n",
    "    Every token is looked up in the notebook-level ``word2index`` dict, so a\n",
    "    token that is missing from that dict raises ``KeyError``.\n",
    "    \"\"\"\n",
    "    indices = []\n",
    "    for token in sent:\n",
    "        indices.append(word2index[token])\n",
    "    return indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:01:47.201932Z",
     "start_time": "2020-05-14T07:01:47.196350Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['吴京', '的', '炒作', '水平', '不输', '冯小刚', '但', '小刚', '至少', '不会', '用', '主旋律', '来', '炒作', '吴京', '让', '人', '看', '了', '不', '舒服', '为了', '主旋律', '而', '主旋律', '为了', '煽情', '而', '煽情', '让', '人', '觉得', '他', '是', '个', '大', '做作', '大', '谎言', '家', '更新', '片子', '整体', '不如', '湄公河', '行动', '整体', '不够', '流畅', '编剧', '有毒', '台词', '尴尬', '刻意', '做作', '的', '主旋律', '煽情', '显得', '如此', '不合时宜', '而', '又', '多余']\n",
      "[1, 6, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 43, 1, 54, 55, 8, 4, 56, 57, 58, 52, 59, 52, 58, 60, 59, 60, 54, 55, 61, 22, 62, 63, 64, 65, 64, 66, 67, 68, 69, 70, 71, 72, 73, 70, 74, 75, 76, 77, 78, 79, 80, 65, 6, 52, 60, 81, 82, 83, 59, 84, 85]\n"
     ]
    }
   ],
   "source": [
    "# Show one tokenised comment, then the same comment as vocabulary indices.\n",
    "sample_tokens = dataset['cleaned_comment'].iloc[2]\n",
    "\n",
    "print(sample_tokens)\n",
    "\n",
    "print(sent2vec(sample_tokens))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将句子向量转化成相同长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T10:11:05.645880Z",
     "start_time": "2020-05-14T10:10:51.368570Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2382890, 100)"
      ]
     },
     "execution_count": 219,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "maxlen = 100\n",
    "\n",
    "# Encode every comment as vocabulary indices, then pad or truncate at the end\n",
    "# of each sequence so that all rows have exactly `maxlen` entries.\n",
    "X = sequence.pad_sequences(\n",
    "    [sent2vec(sent) for sent in dataset['cleaned_comment']],\n",
    "    maxlen=maxlen,\n",
    "    padding='post',\n",
    "    truncating='post',\n",
    ")\n",
    "\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:14:56.995453Z",
     "start_time": "2020-05-14T07:14:56.989443Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2382890,)"
      ]
     },
     "execution_count": 178,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "star_column = dataset['star']\n",
    "\n",
    "# Raw star ratings as a NumPy array, one entry per comment.\n",
    "y = star_column.values\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:14:58.058623Z",
     "start_time": "2020-05-14T07:14:58.018317Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2382890, 5)"
      ]
     },
     "execution_count": 179,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def to_categorical(y, num_classes=None):\n",
    "    \"\"\"One-hot encode 1-based integer class labels.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    y : array-like of int\n",
    "        Class labels starting at 1; label ``k`` sets column ``k - 1``.\n",
    "    num_classes : int, optional\n",
    "        Number of columns in the result. Defaults to the largest label in ``y``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        Array of shape ``(len(y), num_classes)`` holding a single 1 per row.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If any label is smaller than 1 or larger than ``num_classes``.\n",
    "    \"\"\"\n",
    "    y = np.array(y, dtype='int')\n",
    "    n = len(y)\n",
    "    if num_classes is None:\n",
    "        # np.max raises on an empty array, so an empty input gets zero columns.\n",
    "        num_classes = int(np.max(y)) if n else 0\n",
    "    if n and (np.min(y) < 1 or np.max(y) > num_classes):\n",
    "        # Without this check a label of 0 would silently wrap around to the\n",
    "        # last column through the negative index -1.\n",
    "        raise ValueError('labels must be integers in [1, %d]' % num_classes)\n",
    "    categorical = np.zeros((n, num_classes))\n",
    "    categorical[np.arange(n), y - 1] = 1\n",
    "    return categorical\n",
    "\n",
    "\n",
    "y = to_categorical(y)\n",
    "\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:15:03.559987Z",
     "start_time": "2020-05-14T07:15:03.544456Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4 star comments: 30.43%\n",
      "5 star comments: 29.24%\n",
      "3 star comments: 22.63%\n",
      "1 star comments: 9.01%\n",
      "2 star comments: 8.69%\n"
     ]
    }
   ],
   "source": [
    "total_comments = len(dataset['star'])\n",
    "star_counts = dataset['star'].value_counts()\n",
    "\n",
    "# Percentage of comments per star rating, in value_counts order.\n",
    "for label, counts in star_counts.items():\n",
    "    print(f\"{label} star comments: {counts*100/total_comments:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:20:53.623613Z",
     "start_time": "2020-05-14T07:20:51.391932Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.19980611778134955\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1       0.09      0.20      0.12    214614\n",
      "           2       0.09      0.20      0.12    207126\n",
      "           3       0.23      0.20      0.21    539341\n",
      "           4       0.30      0.20      0.24    725081\n",
      "           5       0.29      0.20      0.24    696728\n",
      "\n",
      "    accuracy                           0.20   2382890\n",
      "   macro avg       0.20      0.20      0.19   2382890\n",
      "weighted avg       0.25      0.20      0.21   2382890\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Chance baseline: how accurate are uniformly random star ratings?\n",
    "\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "\n",
    "test_y = dataset['star'].values\n",
    "\n",
    "# Local seeded generator: the baseline is reproducible and NumPy's global\n",
    "# random state is left untouched.\n",
    "rng = np.random.RandomState(42)\n",
    "test_y_pred = rng.randint(1, 6, size=len(test_y))  # upper bound 6 is exclusive\n",
    "\n",
    "print(accuracy_score(test_y, test_y_pred))\n",
    "print(classification_report(test_y, test_y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 嵌入矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:08:34.251494Z",
     "start_time": "2020-05-14T07:08:33.918119Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(163733, 50)"
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embed_size = 50\n",
    "nb_words = len(vocab) + 1\n",
    "\n",
    "# Every row starts as zeros; a row is overwritten only when `wv` contains\n",
    "# the corresponding word, so unknown words keep an all-zero vector.\n",
    "embedding_matrix = np.zeros((nb_words, embed_size))\n",
    "for word, i in word2index.items():\n",
    "    if word not in wv:\n",
    "        continue\n",
    "    embedding_matrix[i] = wv[word]\n",
    "\n",
    "embedding_matrix.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基准模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T07:22:16.423928Z",
     "start_time": "2020-05-14T07:22:14.847867Z"
    }
   },
   "outputs": [],
   "source": [
    "# Five-class baseline: embedding -> BiLSTM -> max pooling -> dense -> softmax.\n",
    "inp = Input(shape=(maxlen, ))\n",
    "x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)\n",
    "\n",
    "# Layers stacked, in order, on top of the embedding output.\n",
    "baseline_layers = [\n",
    "    Bidirectional(\n",
    "        LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
    "    GlobalMaxPool1D(),\n",
    "    Dense(50, activation=\"relu\"),\n",
    "    Dropout(0.1),\n",
    "    Dense(5, activation='softmax'),\n",
    "]\n",
    "for layer in baseline_layers:\n",
    "    x = layer(x)\n",
    "\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T08:17:22.147031Z",
     "start_time": "2020-05-14T07:22:24.985060Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 1787167 samples, validate on 595723 samples\n",
      "Epoch 1/2\n",
      "1787167/1787167 [==============================] - 1647s 921us/sample - loss: 1.1571 - accuracy: 0.4849 - val_loss: 1.0927 - val_accuracy: 0.5085\n",
      "Epoch 2/2\n",
      "1787167/1787167 [==============================] - 1650s 923us/sample - loss: 1.0769 - accuracy: 0.5232 - val_loss: 1.0612 - val_accuracy: 0.5321\n"
     ]
    }
   ],
   "source": [
    "# Train the five-class model for 2 epochs, validating on 25% of the samples.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    epochs=2,\n",
    "    batch_size=128,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T09:16:17.897962Z",
     "start_time": "2020-05-14T09:16:17.895659Z"
    }
   },
   "source": [
    "> 模型性能不佳，可能的原因在于语料本身：部分评论过于主观，不同人给出的三星可能有不同的含义；部分评论基本无意义，与评分没有对应关系"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 五分类转化成三分类\n",
    "- 判断是中立、正面、还是负面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T08:19:33.770896Z",
     "start_time": "2020-05-14T08:19:33.323450Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2382890, 3)"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def to3cat(x):\n",
    "    \"\"\"Collapse a star rating into three sentiment classes.\n",
    "\n",
    "    Returns 1 for ratings below 3 (bad film), 2 for a rating of exactly 3\n",
    "    (average) and 3 for every other value (good film).\n",
    "    \"\"\"\n",
    "    if x == 3:\n",
    "        return 2  # average\n",
    "    return 1 if x < 3 else 3  # bad film / good film\n",
    "\n",
    "\n",
    "y = dataset['star'].apply(to3cat)\n",
    "\n",
    "# One-hot encode the three classes for categorical_crossentropy.\n",
    "y = to_categorical(y)\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T08:19:43.955894Z",
     "start_time": "2020-05-14T08:19:43.562694Z"
    }
   },
   "outputs": [],
   "source": [
    "# Three-class model: embedding -> BiLSTM -> max pooling -> dense -> softmax.\n",
    "inp = Input(shape=(maxlen, ))\n",
    "x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)\n",
    "\n",
    "# Layers stacked, in order, on top of the embedding output.\n",
    "three_class_layers = [\n",
    "    Bidirectional(\n",
    "        LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),\n",
    "    GlobalMaxPool1D(),\n",
    "    Dense(50, activation=\"relu\"),\n",
    "    Dropout(0.1),\n",
    "    Dense(3, activation='softmax'),\n",
    "]\n",
    "for layer in three_class_layers:\n",
    "    x = layer(x)\n",
    "\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T09:16:17.461298Z",
     "start_time": "2020-05-14T08:19:46.947102Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 1787167 samples, validate on 595723 samples\n",
      "Epoch 1/2\n",
      "1787167/1787167 [==============================] - 1696s 949us/sample - loss: 0.6739 - accuracy: 0.7070 - val_loss: 0.5859 - val_accuracy: 0.7609\n",
      "Epoch 2/2\n",
      "1787167/1787167 [==============================] - 1694s 948us/sample - loss: 0.6134 - accuracy: 0.7359 - val_loss: 0.5724 - val_accuracy: 0.7631\n"
     ]
    }
   ],
   "source": [
    "# Train the three-class model for 2 epochs, validating on 25% of the samples.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    epochs=2,\n",
    "    batch_size=128,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二分类\n",
    "- 只判断是正面，还是负面，中性的删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T10:15:20.180518Z",
     "start_time": "2020-05-14T10:15:19.656848Z"
    }
   },
   "outputs": [],
   "source": [
    "def to2cat(x):\n",
    "    \"\"\"Map a star rating to a binary sentiment label.\n",
    "\n",
    "    Returns 0 (negative) for ratings below 3, 1 (positive) for ratings above 3\n",
    "    and -1 for everything else, which marks the row for removal.\n",
    "    \"\"\"\n",
    "    if x < 3:\n",
    "        return 0  # neg\n",
    "    if x > 3:\n",
    "        return 1  # pos\n",
    "    return -1  # get rid of\n",
    "\n",
    "\n",
    "dataset['sentiment'] = dataset['star'].apply(to2cat)\n",
    "\n",
    "# Plain NumPy boolean mask: X is a NumPy array, so it is indexed by position\n",
    "# rather than through the pandas index.\n",
    "mask = (dataset['sentiment'] != -1).values\n",
    "\n",
    "y = dataset['sentiment'].values[mask]\n",
    "\n",
    "# X is overwritten in place, so it is filtered only while it still has one row\n",
    "# per comment; re-running this cell then no longer fails on a length mismatch.\n",
    "if len(X) == len(mask):\n",
    "    X = X[mask]\n",
    "\n",
    "assert len(X) == len(y), 'features and labels must stay aligned'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T10:16:08.586350Z",
     "start_time": "2020-05-14T10:16:08.197504Z"
    }
   },
   "outputs": [],
   "source": [
    "# Binary sentiment model: embedding -> BiLSTM -> max pooling -> dense -> sigmoid.\n",
    "inp = Input(shape=(maxlen, ))\n",
    "x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)\n",
    "x = Bidirectional(\n",
    "    LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)\n",
    "x = GlobalMaxPool1D()(x)\n",
    "x = Dense(50, activation=\"relu\")(x)\n",
    "x = Dropout(0.1)(x)\n",
    "# A softmax over a single unit always outputs 1.0, so the network cannot\n",
    "# learn; sigmoid yields the positive-class probability that\n",
    "# binary_crossentropy expects.\n",
    "x = Dense(1, activation='sigmoid')(x)\n",
    "model = Model(inputs=inp, outputs=x)\n",
    "model.compile(loss='binary_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-14T11:00:34.046062Z",
     "start_time": "2020-05-14T10:16:09.423234Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 1382661 samples, validate on 460888 samples\n",
      "Epoch 1/2\n",
      "1382661/1382661 [==============================] - 1331s 963us/sample - loss: 3.5704 - accuracy: 0.7671 - val_loss: 3.3198 - val_accuracy: 0.7835\n",
      "Epoch 2/2\n",
      "1382661/1382661 [==============================] - 1333s 964us/sample - loss: 3.5704 - accuracy: 0.7671 - val_loss: 3.3198 - val_accuracy: 0.7835\n"
     ]
    }
   ],
   "source": [
    "# Train the binary model for 2 epochs, validating on 25% of the samples.\n",
    "history = model.fit(\n",
    "    X,\n",
    "    y,\n",
    "    epochs=2,\n",
    "    batch_size=128,\n",
    "    validation_split=0.25,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用性能更好的词向量\n",
    "腾讯词向量：https://github.com/cliuxinxin/TX-WORD2VEC-SMALL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jieba.load_userdict(\"70000-dict.txt\")\n",
    "\n",
    "# Parse the text-format embedding file: the first line is skipped as a header,\n",
    "# every other line is read as `word v1 v2 ...`.\n",
    "embeddings_index = {}\n",
    "# Explicit encoding so decoding the Chinese tokens does not depend on the\n",
    "# platform default. NOTE(review): the file is assumed to be UTF-8 - confirm.\n",
    "with open('70000-small.txt', 'r', encoding='utf-8') as f:\n",
    "    next(f, None)  # skip the header line; no error if the file is empty\n",
    "    for line in f:\n",
    "        values = line.split()\n",
    "        if len(values) < 2:\n",
    "            continue  # blank or malformed line: there is no vector to read\n",
    "        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')\n",
    "print('Found %s word vectors.' % len(embeddings_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "631.719px",
    "left": "1375.45px",
    "right": "20px",
    "top": "120px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
